src/jalview/datamodel/HiddenMarkovModel.java

   1 package jalview.datamodel;
   2
   3 import jalview.io.HMMFile;
   4 import jalview.schemes.ResidueProperties;
   5 import jalview.util.Comparison;
   6 import jalview.util.MapList;
   7
   8 import java.util.ArrayList;
   9 import java.util.Arrays;
  10 import java.util.HashMap;
  11 import java.util.List;
  12 import java.util.Map;
  13
  14 /**
  15  * Data structure which stores a hidden Markov model
  16  *
  17  * @author TZVanaalten
  18  *
  19  */
  20 public class HiddenMarkovModel
  21 {
  22   private static final char GAP_DASH = '-';
  23
  24   public final static String YES = "yes";
  25
  26   public final static String NO = "no";
  27
  28   public static final int MATCHTOMATCH = 0;
  29
  30   public static final int MATCHTOINSERT = 1;
  31
  32   public static final int MATCHTODELETE = 2;
  33
  34   public static final int INSERTTOMATCH = 3;
  35
  36   public static final int INSERTTOINSERT = 4;
  37
  38   public static final int DELETETOMATCH = 5;
  39
  40   public static final int DELETETODELETE = 6;
  41
  42   private static final double LOG2 = Math.log(2);
  43
  44   /*
  45    * properties read from HMM file header lines
  46    */
  47   private Map<String, String> fileProperties = new HashMap<>();
  48
  49   private String fileHeader;
  50
  51   /*
  52    * the symbols used in this model e.g. "ACGT"
  53    */
  54   private String alphabet;
  55
  56   /*
  57    * symbol lookup index into the alphabet for 'A' to 'Z'
  58    */
  59   private int[] symbolIndexLookup = new int['Z' - 'A' + 1];
  60
  61   /*
  62    * Nodes in the model. The begin node is at index 0, and contains
  63    * average emission probabilities for each symbol.
  64    */
  65   private List<HMMNode> nodes = new ArrayList<>();
  66
  67   /*
  68    * the aligned HMM consensus sequence extracted from the HMM profile
  69    */
  70   private SequenceI hmmSeq;
  71
  72   /*
  73    * mapping from HMM nodes to residues of the hmm consensus sequence
  74    */
  75   private Mapping mapToHmmConsensus;
  76
  77   /**
  78    * Constructor
  79    */
  80   public HiddenMarkovModel()
  81   {
  82   }
  83
  84   /**
  85    * Copy constructor
  86    *
  87    * @param hmm
  88    */
  89   public HiddenMarkovModel(HiddenMarkovModel hmm)
  90   {
  91     super();
  92     this.fileProperties = new HashMap<>(hmm.fileProperties);
  93     this.alphabet = hmm.alphabet;
  94     this.nodes = new ArrayList<>(hmm.nodes);
  95     this.symbolIndexLookup = hmm.symbolIndexLookup;
  96     this.fileHeader = new String(hmm.fileHeader);
  97   }
  98
  99   /**
 100    * Returns the information content at a specified column, calculated as the
 101    * sum (over possible symbols) of the log ratio
 102    *
 103    * <pre>
 104    *  log(emission probability / background probability) / log(2)
 105    * </pre>
 106    *
 107    * @param column
 108    *          column position (base 0)
 109    * @return
 110    */
 111   public float getInformationContent(int column)
 112   {
 113     float informationContent = 0f;
 114
 115     for (char symbol : getSymbols().toCharArray())
 116     {
 117       float freq = ResidueProperties.backgroundFrequencies
 118               .get(getAlphabetType()).get(symbol);
 119       float prob = (float) getMatchEmissionProbability(column, symbol);
 120       informationContent += prob * Math.log(prob / freq);
 121     }
 122
 123     informationContent = informationContent / (float) LOG2;
 124
 125     return informationContent;
 126   }
 127
 128   /**
 129    * Gets the file header of the .hmm file this model came from
 130    *
 131    * @return
 132    */
 133   public String getFileHeader()
 134   {
 135     return fileHeader;
 136   }
 137
 138   /**
 139    * Sets the file header of this model.
 140    *
 141    * @param header
 142    */
 143   public void setFileHeader(String header)
 144   {
 145     fileHeader = header;
 146   }
 147
 148   /**
 149    * Returns the symbols used in this hidden Markov model
 150    *
 151    * @return
 152    */
 153   public String getSymbols()
 154   {
 155     return alphabet;
 156   }
 157
 158   /**
 159    * Gets the node in the hidden Markov model at the specified position.
 160    *
 161    * @param nodeIndex
 162    *          The index of the node requested. Node 0 optionally contains the
 163    *          average match emission probabilities across the entire model, and
 164    *          always contains the insert emission probabilities and state
 165    *          transition probabilities for the begin node. Node 1 contains the
 166    *          first node in the HMM that can correspond to a column in the
 167    *          alignment.
 168    * @return
 169    */
 170   public HMMNode getNode(int nodeIndex)
 171   {
 172     return nodes.get(nodeIndex);
 173   }
 174
 175   /**
 176    * Returns the name of the sequence alignment on which the HMM is based.
 177    *
 178    * @return
 179    */
 180   public String getName()
 181   {
 182     return fileProperties.get(HMMFile.NAME);
 183   }
 184
 185   /**
 186    * Answers the string value of the property (parsed from an HMM file) for the
 187    * given key, or null if the property is not present
 188    *
 189    * @param key
 190    * @return
 191    */
 192   public String getProperty(String key)
 193   {
 194     return fileProperties.get(key);
 195   }
 196
 197   /**
 198    * Answers true if the property with the given key is present with a value of
 199    * "yes" (not case-sensitive), else false
 200    *
 201    * @param key
 202    * @return
 203    */
 204   public boolean getBooleanProperty(String key)
 205   {
 206     return YES.equalsIgnoreCase(fileProperties.get(key));
 207   }
 208
 209   /**
 210    * Returns the length of the hidden Markov model. The value returned is the
 211    * LENG property if specified, else the number of nodes, excluding the begin
 212    * node (which should be the same thing).
 213    *
 214    * @return
 215    */
 216   public int getLength()
 217   {
 218     if (fileProperties.get(HMMFile.LENGTH) == null)
 219     {
 220       return nodes.size() - 1; // not counting BEGIN node
 221     }
 222     return Integer.parseInt(fileProperties.get(HMMFile.LENGTH));
 223   }
 224
 225   /**
 226    * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are
 227    * the options. Other alphabets may be added.
 228    *
 229    * @return
 230    */
 231   public String getAlphabetType()
 232   {
 233     return fileProperties.get(HMMFile.ALPHABET);
 234   }
 235
 236   /**
 237    * Sets the model alphabet to the symbols in the given string (ignoring any
 238    * whitespace), and returns the number of symbols
 239    *
 240    * @param symbols
 241    */
 242   public int setAlphabet(String symbols)
 243   {
 244     String trimmed = symbols.toUpperCase().replaceAll("\\s", "");
 245     int count = trimmed.length();
 246     alphabet = trimmed;
 247     symbolIndexLookup = new int['Z' - 'A' + 1];
 248     Arrays.fill(symbolIndexLookup, -1);
 249     int ignored = 0;
 250
 251     /*
 252      * save the symbols in order, and a quick lookup of symbol position
 253      */
 254     for (short i = 0; i < count; i++)
 255     {
 256       char symbol = trimmed.charAt(i);
 257       if (symbol >= 'A' && symbol <= 'Z'
 258               && symbolIndexLookup[symbol - 'A'] == -1)
 259       {
 260         symbolIndexLookup[symbol - 'A'] = i;
 261       }
 262       else
 263       {
 264         System.err
 265                 .println(
 266                         "Unexpected or duplicated character in HMM ALPHabet: "
 267                                 + symbol);
 268         ignored++;
 269       }
 270     }
 271     return count - ignored;
 272   }
 273
 274   /**
 275    * Answers the node of the model corresponding to an aligned column position
 276    * (0...), or null if there is no such node
 277    *
 278    * @param column
 279    * @return
 280    */
 281   HMMNode getNodeForColumn(int column)
 282   {
 283     /*
 284      * if the hmm consensus is gapped at the column,
 285      * there is no corresponding node
 286      */
 287     if (Comparison.isGap(hmmSeq.getCharAt(column)))
 288     {
 289       return null;
 290     }
 291
 292     /*
 293      * find the node (if any) that is mapped to the
 294      * consensus sequence residue position at the column
 295      */
 296     int seqPos = hmmSeq.findPosition(column);
 297     int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos);
 298     if (nodeNo != null)
 299     {
 300       return getNode(nodeNo[0]);
 301     }
 302     return null;
 303   }
 304
 305   /**
 306    * Gets the match emission probability for a given symbol at a column in the
 307    * alignment.
 308    *
 309    * @param alignColumn
 310    *          The index of the alignment column, starting at index 0. Index 0
 311    *          usually corresponds to index 1 in the HMM.
 312    * @param symbol
 313    *          The symbol for which the desired probability is being requested.
 314    * @return
 315    *
 316    */
 317   public double getMatchEmissionProbability(int alignColumn, char symbol)
 318   {
 319     HMMNode node = getNodeForColumn(alignColumn);
 320     int symbolIndex = getSymbolIndex(symbol);
 321     if (node != null && symbolIndex != -1)
 322     {
 323       return node.getMatchEmission(symbolIndex);
 324     }
 325     return 0D;
 326   }
 327
 328   /**
 329    * Gets the insert emission probability for a given symbol at a column in the
 330    * alignment.
 331    *
 332    * @param alignColumn
 333    *          The index of the alignment column, starting at index 0. Index 0
 334    *          usually corresponds to index 1 in the HMM.
 335    * @param symbol
 336    *          The symbol for which the desired probability is being requested.
 337    * @return
 338    *
 339    */
 340   public double getInsertEmissionProbability(int alignColumn, char symbol)
 341   {
 342     HMMNode node = getNodeForColumn(alignColumn);
 343     int symbolIndex = getSymbolIndex(symbol);
 344     if (node != null && symbolIndex != -1)
 345     {
 346       return node.getInsertEmission(symbolIndex);
 347     }
 348     return 0D;
 349   }
 350
 351   /**
 352    * Gets the state transition probability for a given symbol at a column in the
 353    * alignment.
 354    *
 355    * @param alignColumn
 356    *          The index of the alignment column, starting at index 0. Index 0
 357    *          usually corresponds to index 1 in the HMM.
 358    * @param symbol
 359    *          The symbol for which the desired probability is being requested.
 360    * @return
 361    *
 362    */
 363   public double getStateTransitionProbability(int alignColumn,
 364           int transition)
 365   {
 366     HMMNode node = getNodeForColumn(alignColumn);
 367     if (node != null)
 368     {
 369       return node.getStateTransition(transition);
 370     }
 371     return 0D;
 372   }
 373
 374   /**
 375    * Returns the sequence position linked to the node at the given index. This
 376    * corresponds to an aligned column position (counting from 1).
 377    *
 378    * @param nodeIndex
 379    *          The index of the node, starting from index 1. Index 0 is the begin
 380    *          node, which does not correspond to a column in the alignment.
 381    * @return
 382    */
 383   public int getNodeMapPosition(int nodeIndex)
 384   {
 385     return nodes.get(nodeIndex).getResidueNumber();
 386   }
 387
 388   /**
 389    * Returns the consensus residue at the specified node.
 390    *
 391    * @param nodeIndex
 392    *          The index of the specified node.
 393    * @return
 394    */
 395   public char getConsensusResidue(int nodeIndex)
 396   {
 397    char value = nodes.get(nodeIndex).getConsensusResidue();
 398    return value;
 399   }
 400
 401   /**
 402    * Returns the reference annotation at the specified node.
 403    *
 404    * @param nodeIndex
 405    *          The index of the specified node.
 406    * @return
 407    */
 408   public char getReferenceAnnotation(int nodeIndex)
 409   {
 410    char value = nodes.get(nodeIndex).getReferenceAnnotation();
 411    return value;
 412   }
 413
 414   /**
 415    * Returns the mask value at the specified node.
 416    *
 417    * @param nodeIndex
 418    *          The index of the specified node.
 419    * @return
 420    */
 421   public char getMaskedValue(int nodeIndex)
 422   {
 423    char value = nodes.get(nodeIndex).getMaskValue();
 424    return value;
 425   }
 426
 427   /**
 428    * Returns the consensus structure at the specified node.
 429    *
 430    * @param nodeIndex
 431    *          The index of the specified node.
 432    * @return
 433    */
 434   public char getConsensusStructure(int nodeIndex)
 435   {
 436    char value = nodes.get(nodeIndex).getConsensusStructure();
 437    return value;
 438   }
 439
 440   /**
 441    * Sets a property read from an HMM file
 442    *
 443    * @param key
 444    * @param value
 445    */
 446   public void setProperty(String key, String value)
 447   {
 448     fileProperties.put(key, value);
 449   }
 450
 451   /**
 452    * Temporary implementation, should not be used.
 453    *
 454    * @return
 455    */
 456   public String getViterbi()
 457   {
 458     String value;
 459     value = fileProperties.get(HMMFile.VITERBI);
 460     return value;
 461   }
 462
 463   /**
 464    * Temporary implementation, should not be used.
 465    *
 466    * @return
 467    */
 468   public String getMSV()
 469   {
 470     String value;
 471     value = fileProperties.get(HMMFile.MSV);
 472     return value;
 473   }
 474
 475   /**
 476    * Temporary implementation, should not be used.
 477    *
 478    * @return
 479    */
 480   public String getForward()
 481   {
 482     String value;
 483     value = fileProperties.get(HMMFile.FORWARD);
 484     return value;
 485   }
 486
 487   /**
 488    * Constructs the consensus sequence based on the most probable symbol at each
 489    * position. Gap characters are inserted for discontinuities in the node map
 490    * numbering (if provided), else an ungapped sequence is generated.
 491    * <p>
 492    * A mapping between the HMM nodes and residue positions of the sequence is
 493    * also built and saved.
 494    *
 495    * @return
 496    */
 497   void buildConsensusSequence()
 498   {
 499     List<int[]> toResidues = new ArrayList<>();
 500
 501     /*
 502      * if the HMM provided a map to sequence, use those start/end values,
 503      * else just treat it as for a contiguous sequence numbered from 1
 504      */
 505     boolean hasMap = getBooleanProperty(HMMFile.MAP);
 506     int start = hasMap ? getNode(1).getResidueNumber() : 1;
 507     int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber()
 508             : (start + getLength() - 1);
 509     char[] sequence = new char[endResNo - start + 1];
 510
 511     int lastResNo = start - 1;
 512     int seqOffset = 0;
 513     int gapCount = 0;
 514
 515     for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++)
 516     {
 517       HMMNode node = nodes.get(nodeNo);
 518       final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1;
 519
 520       /*
 521        * insert gaps if map numbering is not continuous
 522        */
 523       while (resNo > lastResNo + 1)
 524       {
 525         sequence[seqOffset++] = '-';
 526         lastResNo++;
 527         gapCount++;
 528       }
 529       char consensusResidue = node.getConsensusResidue();
 530       if (GAP_DASH == consensusResidue)
 531       {
 532         /*
 533          * no residue annotation in HMM - scan for the symbol
 534          * with the highest match emission probability
 535          */
 536         int symbolIndex = node.getMaxMatchEmissionIndex();
 537         consensusResidue = alphabet.charAt(symbolIndex);
 538         if (node.getMatchEmission(symbolIndex) < 0.5D)
 539         {
 540           // follow convention of lower case if match emission prob < 0.5
 541           consensusResidue = Character.toLowerCase(consensusResidue);
 542         }
 543       }
 544       sequence[seqOffset++] = consensusResidue;
 545       lastResNo = resNo;
 546     }
 547
 548     Sequence seq = new Sequence(getName(), sequence, start,
 549             lastResNo - gapCount);
 550     seq.createDatasetSequence();
 551     seq.setHMM(this);
 552     seq.setIsHMMConsensusSequence(true);
 553
 554     this.hmmSeq = seq;
 555
 556     /*
 557      * construct and store Mapping of nodes to residues
 558      * note as constructed this is just an identity mapping,
 559      * but it allows for greater flexibility in future
 560      */
 561     List<int[]> fromNodes = new ArrayList<>();
 562     fromNodes.add(new int[] { 1, getLength() });
 563     toResidues.add(new int[] { seq.getStart(), seq.getEnd() });
 564     MapList mapList = new MapList(fromNodes, toResidues, 1, 1);
 565     mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList);
 566   }
 567
 568
 569   /**
 570    * Answers the aligned consensus sequence for the profile. Note this will
 571    * return null if called before <code>setNodes</code> has been called.
 572    *
 573    * @return
 574    */
 575   public SequenceI getConsensusSequence()
 576   {
 577     return hmmSeq;
 578   }
 579
 580   /**
 581    * Answers the index position (0...) of the given symbol, or -1 if not a valid
 582    * symbol for this HMM
 583    *
 584    * @param symbol
 585    * @return
 586    */
 587   private int getSymbolIndex(char symbol)
 588   {
 589     /*
 590      * symbolIndexLookup holds the index for 'A' to 'Z'
 591      */
 592     char c = Character.toUpperCase(symbol);
 593     if ('A' <= c && c <= 'Z')
 594     {
 595       return symbolIndexLookup[c - 'A'];
 596     }
 597     return -1;
 598   }
 599
 600   /**
 601    * Sets the nodes of this HMM, and also extracts the HMM consensus sequence
 602    * and a mapping between node numbers and sequence positions
 603    *
 604    * @param nodeList
 605    */
 606   public void setNodes(List<HMMNode> nodeList)
 607   {
 608     nodes = nodeList;
 609     if (nodes.size() > 1)
 610     {
 611       buildConsensusSequence();
 612     }
 613   }
 614 }
 615