3 import jalview.api.AlignExportSettingI;
4 import jalview.api.AlignmentViewPanel;
5 import jalview.datamodel.HMMNode;
6 import jalview.datamodel.HiddenMarkovModel;
7 import jalview.datamodel.SequenceI;
9 import java.io.BufferedReader;
10 import java.io.IOException;
11 import java.util.ArrayList;
12 import java.util.List;
13 import java.util.Scanner;
17 * Adds capability to read in and write out HMMER3 files. .
23 public class HMMFile extends AlignFile
24 implements AlignmentFileReaderI, AlignmentFileWriterI
26 private static final String TERMINATOR = "//";
29 * keys to data in HMM file, used to store as properties of the HiddenMarkovModel
31 public static final String HMM = "HMM";
33 public static final String NAME = "NAME";
35 public static final String ACCESSION_NUMBER = "ACC";
37 public static final String DESCRIPTION = "DESC";
39 public static final String LENGTH = "LENG";
41 public static final String MAX_LENGTH = "MAXL";
43 public static final String ALPHABET = "ALPH";
45 public static final String DATE = "DATE";
47 public static final String COMMAND_LOG = "COM";
49 public static final String NUMBER_OF_SEQUENCES = "NSEQ";
51 public static final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
53 public static final String CHECK_SUM = "CKSUM";
55 public static final String STATISTICS = "STATS";
57 public static final String COMPO = "COMPO";
59 public static final String GATHERING_THRESHOLD = "GA";
61 public static final String TRUSTED_CUTOFF = "TC";
63 public static final String NOISE_CUTOFF = "NC";
65 public static final String VITERBI = "VITERBI";
67 public static final String MSV = "MSV";
69 public static final String FORWARD = "FORWARD";
71 public static final String MAP = "MAP";
73 public static final String REFERENCE_ANNOTATION = "RF";
75 public static final String CONSENSUS_RESIDUE = "CONS";
77 public static final String CONSENSUS_STRUCTURE = "CS";
79 public static final String MASKED_VALUE = "MM";
81 private static final String ALPH_AMINO = "amino";
83 private static final String ALPH_DNA = "DNA";
85 private static final String ALPH_RNA = "RNA";
87 private static final String ALPHABET_AMINO = "ACDEFGHIKLMNPQRSTVWY";
89 private static final String ALPHABET_DNA = "ACGT";
91 private static final String ALPHABET_RNA = "ACGU";
93 private static final int NUMBER_OF_TRANSITIONS = 7;
95 private static final String SPACE = " ";
98 * optional guide line added to an output HMMER file, purely for readability
100 private static final String TRANSITIONTYPELINE = " m->m m->i m->d i->m i->i d->m d->d";
102 private static String NL = System.lineSeparator();
104 private HiddenMarkovModel hmm;
106 // number of symbols in the alphabet used in the hidden Markov model
107 private int numberOfSymbols;
110 * Constructor that parses immediately
114 * @throws IOException
116 public HMMFile(String inFile, DataSourceType type) throws IOException
122 * Constructor that parses immediately
125 * @throws IOException
127 public HMMFile(FileParse source) throws IOException
133 * Default constructor
140 * Constructor for HMMFile used for exporting
144 public HMMFile(HiddenMarkovModel markov)
150 * Returns the HMM produced by parsing a HMMER3 file
154 public HiddenMarkovModel getHMM()
160 * Gets the name of the hidden Markov model
164 public String getName()
166 return hmm.getName();
170 * Reads the data from HMM file into the HMM model
177 hmm = new HiddenMarkovModel();
178 parseHeaderLines(dataIn);
180 } catch (Exception e)
187 * Reads the header properties from a HMMER3 file and saves them in the
188 * HiddeMarkovModel. This method exits after reading the next line after the
192 * @throws IOException
194 void parseHeaderLines(BufferedReader input) throws IOException
196 boolean readingHeaders = true;
197 hmm.setFileHeader(input.readLine());
198 String line = input.readLine();
199 while (readingHeaders && line != null)
201 Scanner parser = new Scanner(line);
202 String next = parser.next();
203 if (ALPHABET.equals(next))
205 String alphabetType = parser.next();
206 hmm.setProperty(ALPHABET, alphabetType);
207 String alphabet = ALPH_DNA.equalsIgnoreCase(alphabetType)
209 : (ALPH_RNA.equalsIgnoreCase(alphabetType) ? ALPHABET_RNA
211 numberOfSymbols = hmm.setAlphabet(alphabet);
213 else if (HMM.equals(next))
215 readingHeaders = false;
216 String symbols = line.substring(line.indexOf(HMM) + HMM.length());
217 numberOfSymbols = hmm.setAlphabet(symbols);
219 else if (STATISTICS.equals(next))
225 value = parser.next() + SPACE + SPACE + parser.next();
226 hmm.setProperty(key, value);
231 String value = parser.next();
232 while (parser.hasNext())
234 value = value + SPACE + parser.next();
236 hmm.setProperty(key, value);
239 line = input.readLine();
244 * Parses the model data from the HMMER3 file. The input buffer should be
245 * positioned at the (optional) COMPO line if there is one, else at the insert
246 * emissions line for the BEGIN node of the model.
249 * @throws IOException
251 void parseModel(BufferedReader input) throws IOException
254 * specification says there must always be an HMM header (already read)
255 * and one more header (guide headings) which is skipped here
258 String line = input.readLine();
259 List<HMMNode> nodes = new ArrayList<>();
261 while (line != null && !TERMINATOR.equals(line))
263 HMMNode node = new HMMNode();
265 Scanner scanner = new Scanner(line);
266 String next = scanner.next();
269 * expect COMPO (optional) for average match emissions
270 * or a node number followed by node's match emissions
272 if (COMPO.equals(next) || nodeNo > 0)
275 * parse match emissions
277 double[] matches = parseDoubles(scanner, numberOfSymbols);
278 node.setMatchEmissions(matches);
279 if (!COMPO.equals(next))
281 int resNo = parseAnnotations(scanner, node);
285 * no MAP annotation provided, just number off from 0 (begin node)
289 node.setResidueNumber(resNo);
291 line = input.readLine();
296 * parse insert emissions
298 scanner = new Scanner(line);
299 double[] inserts = parseDoubles(scanner, numberOfSymbols);
300 node.setInsertEmissions(inserts);
304 * parse state transitions
306 line = input.readLine();
307 scanner = new Scanner(line);
308 double[] transitions = parseDoubles(scanner,
309 NUMBER_OF_TRANSITIONS);
310 node.setStateTransitions(transitions);
312 line = input.readLine();
321 * Parses the annotations on the match emission line and add them to the node.
322 * (See p109 of the HMMER User Guide (V3.1b2) for the specification.) Returns
323 * the residue position that the node maps to, if provided, else zero.
328 int parseAnnotations(Scanner scanner, HMMNode node)
333 * map from hmm node to sequence position, if provided
335 if (scanner.hasNext())
337 String value = scanner.next();
338 if (!"-".equals(value))
342 mapTo = Integer.parseInt(value);
343 node.setResidueNumber(mapTo);
344 } catch (NumberFormatException e)
352 * hmm consensus residue if provided, else '-'
354 if (scanner.hasNext())
356 node.setConsensusResidue(scanner.next().charAt(0));
360 * RF reference annotation, if provided, else '-'
362 if (scanner.hasNext())
364 node.setReferenceAnnotation(scanner.next().charAt(0));
368 * 'm' for masked position, if provided, else '-'
370 if (scanner.hasNext())
372 node.setMaskValue(scanner.next().charAt(0));
376 * structure consensus symbol, if provided, else '-'
378 if (scanner.hasNext())
380 node.setConsensusStructure(scanner.next().charAt(0));
387 * Fills an array of doubles parsed from an input line
390 * @param numberOfElements
392 * @throws IOException
394 static double[] parseDoubles(Scanner input,
395 int numberOfElements) throws IOException
397 double[] values = new double[numberOfElements];
398 for (int i = 0; i < numberOfElements; i++)
400 if (!input.hasNext())
402 throw new IOException("Incomplete data");
404 String next = input.next();
405 if (next.contains("*"))
407 values[i] = Double.NEGATIVE_INFINITY;
411 double prob = Double.valueOf(next);
412 prob = Math.pow(Math.E, -prob);
420 * Returns a string to be added to the StringBuilder containing the entire
423 * @param initialColumnSeparation
424 * The initial whitespace separation between the left side of the
425 * file and first character.
426 * @param columnSeparation
427 * The separation between subsequent data entries.
429 * The list of data to be added to the String.
432 String addData(int initialColumnSeparation,
433 int columnSeparation, List<String> data)
436 boolean first = true;
437 for (String value : data)
439 int sep = first ? initialColumnSeparation : columnSeparation;
440 line += String.format("%" + sep + "s", value);
447 * Converts list of characters into a list of Strings.
450 * @return Returns the list of Strings.
452 List<String> charListToStringList(List<Character> list)
454 List<String> strList = new ArrayList<>();
455 for (char value : list)
457 String strValue = Character.toString(value);
458 strList.add(strValue);
464 * Converts an array of doubles into a list of Strings, rounded to the nearest
468 * @param noOfDecimals
471 List<String> doublesToStringList(double[] doubles)
473 List<String> strList = new ArrayList<>();
474 for (double value : doubles)
479 strValue = String.format("%.5f", value);
481 else if (value == -0.00000d)
483 strValue = "0.00000";
489 strList.add(strValue);
495 * Appends model data in string format to the string builder
499 void appendModelAsString(StringBuilder output)
501 output.append(HMM).append(" ");
502 String charSymbols = hmm.getSymbols();
503 for (char c : charSymbols.toCharArray())
505 output.append(String.format("%9s", c));
507 output.append(NL).append(TRANSITIONTYPELINE);
509 int length = hmm.getLength();
511 for (int nodeNo = 0; nodeNo <= length; nodeNo++)
513 String matchLine = String.format("%7s",
514 nodeNo == 0 ? COMPO : Integer.toString(nodeNo));
516 double[] doubleMatches = convertToLogSpace(
517 hmm.getNode(nodeNo).getMatchEmissions());
518 List<String> strMatches = doublesToStringList(doubleMatches);
519 matchLine += addData(10, 9, strMatches);
523 matchLine += SPACE + (hmm.getNodeMapPosition(nodeNo));
524 matchLine += SPACE + hmm.getConsensusResidue(nodeNo);
525 matchLine += SPACE + hmm.getReferenceAnnotation(nodeNo);
526 if (hmm.getFileHeader().contains("HMMER3/f"))
528 matchLine += SPACE + hmm.getMaskedValue(nodeNo);
529 matchLine += SPACE + hmm.getConsensusStructure(nodeNo);
533 output.append(NL).append(matchLine);
535 String insertLine = "";
537 double[] doubleInserts = convertToLogSpace(
538 hmm.getNode(nodeNo).getInsertEmissions());
539 List<String> strInserts = doublesToStringList(doubleInserts);
540 insertLine += addData(17, 9, strInserts);
542 output.append(NL).append(insertLine);
544 String transitionLine = "";
545 double[] doubleTransitions = convertToLogSpace(
546 hmm.getNode(nodeNo).getStateTransitions());
547 List<String> strTransitions = doublesToStringList(
549 transitionLine += addData(17, 9, strTransitions);
551 output.append(NL).append(transitionLine);
556 * Appends formatted HMM file properties to the string builder
560 void appendProperties(StringBuilder output)
562 output.append(hmm.getFileHeader());
564 String format = "%n%-5s %1s";
565 appendProperty(output, format, NAME);
566 appendProperty(output, format, ACCESSION_NUMBER);
567 appendProperty(output, format, DESCRIPTION);
568 appendProperty(output, format, LENGTH);
569 appendProperty(output, format, MAX_LENGTH);
570 appendProperty(output, format, ALPHABET);
571 appendBooleanProperty(output, format, REFERENCE_ANNOTATION);
572 appendBooleanProperty(output, format, MASKED_VALUE);
573 appendBooleanProperty(output, format, CONSENSUS_RESIDUE);
574 appendBooleanProperty(output, format, CONSENSUS_STRUCTURE);
575 appendBooleanProperty(output, format, MAP);
576 appendProperty(output, format, DATE);
577 appendProperty(output, format, NUMBER_OF_SEQUENCES);
578 appendProperty(output, format, EFF_NUMBER_OF_SEQUENCES);
579 appendProperty(output, format, CHECK_SUM);
580 appendProperty(output, format, GATHERING_THRESHOLD);
581 appendProperty(output, format, TRUSTED_CUTOFF);
582 appendProperty(output, format, NOISE_CUTOFF);
584 if (hmm.getMSV() != null)
586 format = "%n%-19s %18s";
587 output.append(String.format(format, "STATS LOCAL MSV", hmm.getMSV()));
589 output.append(String.format(format, "STATS LOCAL VITERBI",
592 output.append(String.format(format, "STATS LOCAL FORWARD",
598 * Appends 'yes' or 'no' for the given property, according to whether or not
599 * it is set in the HMM
603 * @param propertyName
605 private void appendBooleanProperty(StringBuilder output, String format,
608 boolean set = hmm.getBooleanProperty(propertyName);
609 output.append(String.format(format, propertyName,
610 set ? HiddenMarkovModel.YES : HiddenMarkovModel.NO));
614 * Appends the value of the given property to the output, if not null
618 * @param propertyName
620 private void appendProperty(StringBuilder output, String format,
623 String value = hmm.getProperty(propertyName);
626 output.append(String.format(format, propertyName, value));
631 public String print(SequenceI[] sequences, boolean jvsuffix)
633 if (sequences[0].getHMM() != null)
635 hmm = sequences[0].getHMM();
641 * Prints the .hmm file to a String.
645 public String print()
647 StringBuilder output = new StringBuilder();
648 appendProperties(output);
650 appendModelAsString(output);
651 output.append(NL).append(TERMINATOR).append(NL);
652 return output.toString();
656 * Converts the probabilities contained in an array into log space
660 double[] convertToLogSpace(double[] ds)
662 double[] converted = new double[ds.length];
663 for (int i = 0; i < ds.length; i++)
666 double logProb = -1 * Math.log(prob);
668 converted[i] = logProb;
674 * Returns the HMM sequence produced by reading a .hmm file.
677 public SequenceI[] getSeqsAsArray()
679 SequenceI hmmSeq = hmm.getConsensusSequence();
680 SequenceI[] seq = new SequenceI[1];
686 public void setNewlineString(String newLine)
692 public void setExportSettings(AlignExportSettingI exportSettings)
698 public void configureForView(AlignmentViewPanel viewpanel)
704 public boolean hasWarningMessage()
710 public String getWarningMessage()
712 return "warning message";