3 import jalview.datamodel.HMMNode;
4 import jalview.datamodel.HiddenMarkovModel;
5 import jalview.datamodel.SequenceI;
7 import java.io.BufferedReader;
8 import java.io.FileNotFoundException;
9 import java.io.IOException;
10 import java.io.PrintWriter;
11 import java.io.UnsupportedEncodingException;
12 import java.util.ArrayList;
13 import java.util.List;
14 import java.util.Scanner;
18 * Adds capability to read in and write out HMMER3 files. Currently only supports HMMER3/f.
24 public class HMMFile extends AlignFile
25 implements AlignmentFileReaderI, AlignmentFileWriterI
27 // HMM to store file data
28 private HiddenMarkovModel hmm = new HiddenMarkovModel();
30 // number of possible transitions
// (the count of values read from each state transition line; it matches the
// seven labels listed in TRANSITIONTYPELINE)
31 private final int NUMBER_OF_TRANSITIONS = 7;
// newline constant
// NOTE(review): no use of NEW_LINE is visible in this view -- confirm it is
// still needed.
33 private final String NEW_LINE = "\n";
35 //number of symbols in the alphabet used in the hidden Markov model
// NOTE(review): the field this comment describes (numberOfSymbols, assigned
// in parseFileProperties) is not visible in this view.
// single space used to join tokens and annotation columns
38 private final String SPACE = " ";
// first token of a match emission line that parseModel tests for
40 private final String COMPO = "COMPO";
// empty string used as the starting value when building output lines
42 private final String EMPTY = "";
44 //This is a line that needs to be added to each HMMER3 file. It is purely for readability.
45 private static final String TRANSITIONTYPELINE = "m->m m->i m->d i->m i->i d->m d->d";
48 * Constructor for HMMFile, parses immediately
53 public HMMFile(FileParse source) throws IOException
60 * Default constructor, do not use!
68 * Constructor for HMMFile used for exporting.
71 * @param markov
73 public HMMFile(HiddenMarkovModel markov)
79 * Returns the HMM produced by reading in a HMMER3 file.
83 public HiddenMarkovModel getHMM()
89 * Sets the HMM used in this file.
93 public void setHMM(HiddenMarkovModel model)
99 * Gets the name of the hidden Markov model.
103 public String getName()
105 return hmm.getName();
109 * Reads the data from HMM file into the HMM field on this object.
111 * @throws IOException
114 public void parse() throws IOException
// Passes the dataIn reader to parseFileProperties. dataIn is not declared in
// this view (presumably inherited from AlignFile -- confirm).
// NOTE(review): any further statements of this method are not visible here.
116 parseFileProperties(dataIn);
121 * Reads the data from HMM file into the HMM field on this object.
123 * @throws IOException
126 public void parse(BufferedReader br) throws IOException
// Same entry point as parse(), but reads from the caller-supplied reader.
// NOTE(review): any further statements of this method are not visible here.
128 parseFileProperties(br);
135 * Imports the file properties from a HMMER3 file.
138 * The buffered reader used to read in the file.
139 * @throws IOException
141 void parseFileProperties(BufferedReader input) throws IOException
// Stores the first line of the input as the file header, then tokenises the
// following lines with a Scanner and dispatches on each line's first token.
// NOTE(review): the loop header and the handling of `key` are not visible in
// this view.
143 boolean readingFile = true;
144 hmm.setFileHeader(input.readLine());
145 String line = input.readLine();
// NOTE(review): new Scanner(line) throws NullPointerException if readLine()
// returned null (end of stream) and parser.next() throws
// NoSuchElementException on a blank line; no guard is visible here -- confirm.
// NOTE(review): no close() call for this Scanner is visible -- confirm.
150 Scanner parser = new Scanner(line);
151 String next = parser.next();
152 if ("HMM".equals(next)) // indicates start of HMM data (end of file properties)
// the remaining tokens on the "HMM" line are handed to the model as its
// symbols, and the resulting symbol count is cached for parseModel
156 hmm.fillSymbols(parser);
157 numberOfSymbols = hmm.getNumberOfSymbols();
159 else if ("STATS".equals(next))
// the value is built from two further tokens joined by two spaces
165 value = parser.next() + SPACE + SPACE + parser.next();
166 hmm.addFileProperty(key, value);
// any other line: all remaining tokens are joined with single spaces
171 String value = parser.next();
172 while (parser.hasNext())
174 value = value + SPACE + parser.next();
176 hmm.addFileProperty(key, value);
180 line = input.readLine();
190 * Parses the model data from the HMMER3 file
193 * The buffered reader used to read the file.
194 * @throws IOException
196 void parseModel(BufferedReader input) throws IOException
// Reads lines until one equals "//". Each pass appends a new HMMNode and then
// reads three lines for it: match emissions, insert emissions and state
// transitions.
// NOTE(review): the declaration and increment of `node` and `next` are not
// visible in this view.
198 String line = input.readLine();
// NOTE(review): if the stream ends without a "//" line, `line` becomes null,
// the condition stays true and new Scanner(line) would throw
// NullPointerException -- confirm whether a guard exists outside this view.
200 while (!"//".equals(line))
202 hmm.getNodes().add(new HMMNode());
// NOTE(review): no close() call for matchReader is visible -- confirm.
204 Scanner matchReader = new Scanner(line);
205 next = matchReader.next();
// match emissions are parsed only when the first token is COMPO or the node
// index is greater than zero
206 if (next.equals(COMPO) || node > 0)
208 // stores match emission line in list
// the list created on the next line is replaced immediately by fillList's
// result, so the allocation is redundant (same pattern for inserts and
// transitions below)
209 List<Double> matches = new ArrayList<>();
210 matches = fillList(matchReader, numberOfSymbols);
211 hmm.getNodes().get(node).setMatchEmissions(matches);
// the tokens left on the match line are the per-node annotations
214 parseAnnotations(matchReader, node);
218 // stores insert emission line in list
219 line = input.readLine();
220 Scanner insertReader = new Scanner(line);
221 List<Double> inserts = new ArrayList<>();
222 inserts = fillList(insertReader, numberOfSymbols);
223 hmm.getNodes().get(node).setInsertEmissions(inserts);
224 insertReader.close();
226 // stores state transition line in list
227 line = input.readLine();
228 Scanner transitionReader = new Scanner(line);
229 List<Double> transitions = new ArrayList<>();
230 transitions = fillList(transitionReader, NUMBER_OF_TRANSITIONS);
231 hmm.getNodes().get(node).setStateTransitions(transitions);
232 transitionReader.close();
233 line = input.readLine();
240 * Parses the annotations on the match emission line.
243 * The scanner which is processing match emission line.
245 * The index of node which is being scanned.
247 void parseAnnotations(Scanner scanner, int index)
// Consumes the annotation tokens of a match emission line in a fixed order
// (alignment column, consensus residue, reference annotation, mask value,
// consensus structure) and stores each on the node at `index`.
// NOTE(review): the local declarations and the branch taken when the map is
// not active are not visible in this view.
249 if (hmm.mapIsActive())
// the column is stored on the node and also registered in the model's
// lookup under column -> node index
252 column = scanner.nextInt();
253 hmm.getNodes().get(index).setAlignmentColumn(column);
254 hmm.getNodeLookup().put(column, index);
// each remaining annotation is read only if another token is available, and
// is reduced to a char through charValue
261 if (scanner.hasNext())
264 consensusR = charValue(scanner.next());
265 hmm.getNodes().get(index).setConsensusResidue(consensusR);
268 if (scanner.hasNext())
271 reference = charValue(scanner.next());
272 hmm.getNodes().get(index).setReferenceAnnotation(reference);
275 if (scanner.hasNext())
278 value = charValue(scanner.next());
279 hmm.getNodes().get(index).setMaskValue(value);
281 if (scanner.hasNext())
284 consensusS = charValue(scanner.next());
285 hmm.getNodes().get(index).setConsensusStructure(consensusS);
292 * Fills a list of doubles based on an input line.
295 * The scanner for the line containing the data to be transferred to
297 * @param numberOfElements
298 * The number of elements in the list to be filled.
299 * @return filled list Returns the list of doubles.
301 static List<Double> fillList(Scanner input,
302 int numberOfElements)
304 List<Double> list = new ArrayList<>();
305 for (int i = 0; i < numberOfElements; i++)
308 String next = input.next();
309 if (next.contains("*")) // state transitions to or from delete states
310 // occasionally have values of -infinity. These
311 // values are represented by an * in the .hmm
314 list.add(Double.NEGATIVE_INFINITY);
318 double prob = Double.valueOf(next);
319 prob = Math.pow(Math.E, -prob);
328 * Writes a HMM to a file/
330 * @param exportLocation
331 * Filename, URL or Pasted String to write to.
332 * @throws FileNotFoundException
333 * @throws UnsupportedEncodingException
337 public void exportFile(String exportLocation) throws IOException
339 PrintWriter writer = new PrintWriter(exportLocation);
340 appendFileProperties(writer);
342 writer.println("//");
349 * Returns a string to be added to the StringBuilder containing the entire
352 * @param initialColumnSeparation
353 * The initial whitespace separation between the left side of the
354 * file and first character.
355 * @param columnSeparation
356 * The separation between subsequent data entries.
358 * The list fo data to be added to the String.
361 String addData(int initialColumnSeparation,
362 int columnSeparation, List<String> data)
366 for (String value : data)
370 line += String.format("%" + initialColumnSeparation + "s", value);
374 line += String.format("%" + columnSeparation + "s", value);
382 * Converts list of characters into a list of Strings.
385 * @return Returns the list of Strings.
387 List<String> charListToStringList(List<Character> list)
389 List<String> strList = new ArrayList<>();
390 for (char value : list)
392 String strValue = Character.toString(value);
393 strList.add(strValue);
399 * Converts a list of doubles into a list of Strings formatted to five decimal places.
403 * @param list
406 List<String> doubleListToStringList(List<Double> list)
// Builds a String for each value and collects them in a new list.
// NOTE(review): the condition guarding the first branch and any fallback
// branch are not visible in this view.
408 List<String> strList = new ArrayList<>();
409 for (double value : list)
// NOTE(review): String.format without an explicit Locale uses the default
// locale, so the decimal separator may not be '.' on every machine --
// confirm this is acceptable for the file format.
414 strValue = String.format("%.5f", value);
// == on doubles treats 0.0 and -0.0 as equal, so both are written as
// "0.00000"
417 else if (value == -0.00000d)
419 strValue = "0.00000";
426 strList.add(strValue);
432 * Converts a primitive array of Strings to a list of Strings.
437 List<String> stringArrayToStringList(String[] array)
// Creates a new ArrayList and iterates over the array.
// NOTE(review): the loop body and return are not visible in this view;
// presumably each element is added in order -- confirm.
439 List<String> list = new ArrayList<>();
440 for (String value : array)
449 * Appends the hidden Markov model data to the supplied PrintWriter.
453 * The PrintWriter that receives the output.
455 void appendModel(PrintWriter writer)
// Writes the "HMM" symbol line and the transition-type header, then three
// lines (match emissions, insert emissions, state transitions) for each node
// index from 0 to hmm.getLength() inclusive.
// NOTE(review): several statements (declarations and branch conditions) are
// not visible in this view.
457 String symbolLine = "HMM";
458 List<Character> charSymbols = hmm.getSymbols();
459 List<String> strSymbols;
460 strSymbols = charListToStringList(charSymbols);
461 symbolLine += addData(11, 9, strSymbols);
462 writer.println(symbolLine);
463 writer.println(TRANSITIONTYPELINE);
465 int length = hmm.getLength();
467 for (int node = 0; node <= length; node++)
// the match line starts with either the literal "COMPO" or the node index,
// right-justified in seven characters; the condition choosing between them
// is not visible (presumably node 0 gets "COMPO" -- confirm)
472 matchLine = String.format("%7s", "COMPO");
476 matchLine = String.format("%7s", node);
// stored values are converted by convertListToLogSpace, formatted as
// Strings and laid out in columns by addData
479 List<String> strMatches;
480 List<Double> doubleMatches;
481 doubleMatches = convertListToLogSpace(
482 hmm.getNode(node).getMatchEmissions());
483 strMatches = doubleListToStringList(doubleMatches);
484 matchLine += addData(10, 9, strMatches);
// annotation columns follow the emissions; the alignment column is written
// with 1 added to the stored value
489 matchLine += SPACE + (hmm.getNodeAlignmentColumn(node) + 1);
490 matchLine += SPACE + hmm.getConsensusResidue(node);
491 matchLine += SPACE + hmm.getReferenceAnnotation(node);
// mask value and consensus structure are written only when the stored file
// header contains "HMMER3/f"
// NOTE(review): getFileHeader() returning null would throw here -- confirm
// the header is always set before export.
492 if (hmm.getFileHeader().contains("HMMER3/f"))
494 matchLine += SPACE + hmm.getMaskedValue(node);
495 matchLine += SPACE + hmm.getConsensusStructure(node);
500 writer.println(matchLine);
502 String insertLine = EMPTY;
503 List<String> strInserts;
504 List<Double> doubleInserts;
505 doubleInserts = convertListToLogSpace(
506 hmm.getNode(node).getInsertEmissions());
507 strInserts = doubleListToStringList(doubleInserts);
508 insertLine += addData(17, 9, strInserts);
510 writer.println(insertLine);
512 String transitionLine = EMPTY;
513 List<String> strTransitions;
514 List<Double> doubleTransitions;
515 doubleTransitions = convertListToLogSpace(
516 hmm.getNode(node).getStateTransitions());
517 strTransitions = doubleListToStringList(doubleTransitions);
518 transitionLine += addData(17, 9, strTransitions);
520 writer.println(transitionLine);
525 * Appends the hidden Markov model file properties to the supplied
526 * PrintWriter.
529 * The PrintWriter that receives the output.
531 void appendFileProperties(PrintWriter writer)
// Writes the file header line followed by one line per file property, in a
// fixed order. Properties guarded by a null check are written only when the
// model holds a value for them.
// NOTE(review): local declarations and the continuation lines of several
// String.format calls are not visible in this view.
535 writer.println(hmm.getFileHeader());
// "%-5s %1s": the tag is left-justified and padded to five characters,
// followed by a space and the value
537 line = String.format("%-5s %1s", "NAME", hmm.getName());
538 writer.println((line));
540 if (hmm.getAccessionNumber() != null)
542 line = String.format("%-5s %1s", "ACC", hmm.getAccessionNumber());
543 writer.println((line));
546 if (hmm.getDescription() != null)
548 line = String.format("%-5s %1s", "DESC", hmm.getDescription());
549 writer.println((line));
551 line = String.format("%-5s %1s", "LENG", hmm.getLength());
552 writer.println((line));
554 if (hmm.getMaxInstanceLength() != null)
556 line = String.format("%-5s %1s", "MAXL", hmm.getMaxInstanceLength());
557 writer.println((line));
559 line = String.format("%-5s %1s", "ALPH", hmm.getAlphabetType());
560 writer.println((line));
// the five annotation flags (RF, MM, CONS, CS, MAP) are each read as a
// boolean and converted with HiddenMarkovModel.findStringFromBoolean
565 status = hmm.referenceAnnotationIsActive();
566 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
567 line = String.format("%-5s %1s", "RF",
569 writer.println((line));
571 status = hmm.maskValueIsActive();
572 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
573 line = String.format("%-5s %1s", "MM",
575 writer.println((line));
577 status = hmm.consensusResidueIsActive();
578 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
579 line = String.format("%-5s %1s", "CONS",
581 writer.println((line));
583 status = hmm.consensusStructureIsActive();
584 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
585 line = String.format("%-5s %1s", "CS",
587 writer.println((line));
589 status = hmm.mapIsActive();
590 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
591 line = String.format("%-5s %1s", "MAP",
593 writer.println((line));
596 if (hmm.getDate() != null)
598 line = String.format("%-5s %1s", "DATE", hmm.getDate());
599 writer.println((line));
601 if (hmm.getNumberOfSequences() != null)
603 line = String.format("%-5s %1s", "NSEQ", hmm.getNumberOfSequences());
604 writer.println((line));
606 if (hmm.getEffectiveNumberOfSequences() != null)
608 line = String.format("%-5s %1s", "EFFN",
609 hmm.getEffectiveNumberOfSequences());
610 writer.println((line));
612 if (hmm.getCheckSum() != null)
614 line = String.format("%-5s %1s", "CKSUM", hmm.getCheckSum());
615 writer.println((line));
617 if (hmm.getGatheringThreshold() != null)
619 line = String.format("%-5s %1s", "GA", hmm.getGatheringThreshold());
620 writer.println((line));
623 if (hmm.getTrustedCutoff() != null)
625 line = String.format("%-5s %1s", "TC", hmm.getTrustedCutoff());
626 writer.println((line));
628 if (hmm.getNoiseCutoff() != null)
630 line = String.format("%-5s %1s", "NC", hmm.getNoiseCutoff());
631 writer.println((line));
// the STATS lines use "%-19s %18s": tag left-justified in 19 characters,
// value right-justified in 18; the visible guard tests only the MSV value
633 if (hmm.getMSV() != null)
635 line = String.format("%-19s %18s", "STATS LOCAL MSV", hmm.getMSV());
636 writer.println((line));
638 line = String.format("%-19s %18s", "STATS LOCAL VITERBI",
640 writer.println((line));
642 line = String.format("%-19s %18s", "STATS LOCAL FORWARD",
644 writer.println((line));
650 * Returns the char value of a single lettered String.
655 char charValue(String string)
// Takes the first character of the given String.
// NOTE(review): charAt(0) throws StringIndexOutOfBoundsException for an empty
// String and NullPointerException for null; the declaration of `character`
// and the return statement are not visible in this view.
658 character = string.charAt(0);
664 public String print(SequenceI[] seqs, boolean jvsuffix)
671 * Converts the probabilities contained in a list into log space.
675 List<Double> convertListToLogSpace(List<Double> list)
678 List<Double> convertedList = new ArrayList<>();
679 for (int i = 0; i < list.size(); i++)
681 double prob = list.get(i);
682 double logProb = -1 * Math.log(prob);
684 convertedList.add(logProb);
686 return convertedList;