src/jalview/datamodel/HiddenMarkovModel.java

   1 package jalview.datamodel;
   2
   3 import jalview.io.HMMFile;
   4 import jalview.schemes.ResidueProperties;
   5 import jalview.util.Comparison;
   6
   7 import java.util.ArrayList;
   8 import java.util.Arrays;
   9 import java.util.HashMap;
  10 import java.util.List;
  11 import java.util.Map;
  12
  13 /**
  14  * Data structure which stores a hidden Markov model
  15  *
  16  * @author TZVanaalten
  17  *
  18  */
  19 public class HiddenMarkovModel
  20 {
  21   public final static String YES = "yes";
  22
  23   public final static String NO = "no";
  24
  25   public static final int MATCHTOMATCH = 0;
  26
  27   public static final int MATCHTOINSERT = 1;
  28
  29   public static final int MATCHTODELETE = 2;
  30
  31   public static final int INSERTTOMATCH = 3;
  32
  33   public static final int INSERTTOINSERT = 4;
  34
  35   public static final int DELETETOMATCH = 5;
  36
  37   public static final int DELETETODELETE = 6;
  38
  39   private static final double LOG2 = Math.log(2);
  40
  41   /*
  42    * properties read from HMM file header lines
  43    */
  44   Map<String, String> fileProperties = new HashMap<>();
  45
  46   String fileHeader;
  47
  48   /*
  49    * the symbols used in this model e.g. "ACGT"
  50    */
  51   String alphabet;
  52
  53   /*
  54    * symbol lookup index into the alphabet for 'A' to 'Z'
  55    */
  56   int[] symbolIndexLookup = new int['Z' - 'A' + 1];
  57
  58   /*
  59    * Nodes in the model. The begin node is at index 0, and contains
  60    * average emission probabilities for each symbol.
  61    */
  62   List<HMMNode> nodes = new ArrayList<>();
  63
  64   /*
  65    * lookup of the HMM node for each alignment column (from 0)
  66    */
  67   Map<Integer, HMMNode> nodeLookup = new HashMap<>();
  68
  69   /**
  70    * Constructor
  71    */
  72   public HiddenMarkovModel()
  73   {
  74   }
  75
  76   public HiddenMarkovModel(HiddenMarkovModel hmm)
  77   {
  78     super();
  79     this.fileProperties = new HashMap<>(hmm.fileProperties);
  80     this.alphabet = hmm.alphabet;
  81     this.nodes = new ArrayList<>(hmm.nodes);
  82     this.nodeLookup = new HashMap<>(hmm.nodeLookup);
  83     this.symbolIndexLookup = hmm.symbolIndexLookup;
  84     this.fileHeader = new String(hmm.fileHeader);
  85   }
  86
  87   /**
  88    * Returns the information content at a specified column, calculated as the
  89    * sum (over possible symbols) of the log ratio
  90    *
  91    * <pre>
  92    *  log(emission probability / background probability) / log(2)
  93    * </pre>
  94    *
  95    * @param column
  96    *          column position (base 0)
  97    * @return
  98    */
  99   public float getInformationContent(int column)
 100   {
 101     float informationContent = 0f;
 102
 103     for (char symbol : getSymbols().toCharArray())
 104     {
 105       float freq = ResidueProperties.backgroundFrequencies
 106               .get(getAlphabetType()).get(symbol);
 107       float prob = (float) getMatchEmissionProbability(column, symbol);
 108       informationContent += prob * Math.log(prob / freq);
 109     }
 110
 111     informationContent = informationContent / (float) LOG2;
 112
 113     return informationContent;
 114   }
 115
 116   /**
 117    * Gets the file header of the .hmm file this model came from
 118    *
 119    * @return
 120    */
 121   public String getFileHeader()
 122   {
 123     return fileHeader;
 124   }
 125
 126   /**
 127    * Sets the file header of this model.
 128    *
 129    * @param header
 130    */
 131   public void setFileHeader(String header)
 132   {
 133     fileHeader = header;
 134   }
 135
 136   /**
 137    * Returns the symbols used in this hidden Markov model
 138    *
 139    * @return
 140    */
 141   public String getSymbols()
 142   {
 143     return alphabet;
 144   }
 145
 146   /**
 147    * Gets the node in the hidden Markov model at the specified position.
 148    *
 149    * @param nodeIndex
 150    *          The index of the node requested. Node 0 optionally contains the
 151    *          average match emission probabilities across the entire model, and
 152    *          always contains the insert emission probabilities and state
 153    *          transition probabilities for the begin node. Node 1 contains the
 154    *          first node in the HMM that can correspond to a column in the
 155    *          alignment.
 156    * @return
 157    */
 158   public HMMNode getNode(int nodeIndex)
 159   {
 160     return nodes.get(nodeIndex);
 161   }
 162
 163   /**
 164    * Returns the name of the sequence alignment on which the HMM is based.
 165    *
 166    * @return
 167    */
 168   public String getName()
 169   {
 170     return fileProperties.get(HMMFile.NAME);
 171   }
 172
 173   /**
 174    * Answers the string value of the property (parsed from an HMM file) for the
 175    * given key, or null if the property is not present
 176    *
 177    * @param key
 178    * @return
 179    */
 180   public String getProperty(String key)
 181   {
 182     return fileProperties.get(key);
 183   }
 184
 185   /**
 186    * Answers true if the property with the given key is present with a value of
 187    * "yes" (not case-sensitive), else false
 188    *
 189    * @param key
 190    * @return
 191    */
 192   public boolean getBooleanProperty(String key)
 193   {
 194     return YES.equalsIgnoreCase(fileProperties.get(key));
 195   }
 196
 197   /**
 198    * Returns the length of the hidden Markov model, or 0 if not known
 199    *
 200    * @return
 201    */
 202   public int getLength()
 203   {
 204     if (fileProperties.get(HMMFile.LENGTH) == null)
 205     {
 206       return 0;
 207     }
 208     return Integer.parseInt(fileProperties.get(HMMFile.LENGTH));
 209   }
 210
 211   /**
 212    * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are
 213    * the options. Other alphabets may be added.
 214    *
 215    * @return
 216    */
 217   public String getAlphabetType()
 218   {
 219     return fileProperties.get(HMMFile.ALPHABET);
 220   }
 221
 222   /**
 223    * Sets the model alphabet to the symbols in the given string (ignoring any
 224    * whitespace), and returns the number of symbols
 225    *
 226    * @param symbols
 227    */
 228   public int setAlphabet(String symbols)
 229   {
 230     String trimmed = symbols.toUpperCase().replaceAll("\\s", "");
 231     int count = trimmed.length();
 232     alphabet = trimmed;
 233     symbolIndexLookup = new int['Z' - 'A' + 1];
 234     Arrays.fill(symbolIndexLookup, -1);
 235     int ignored = 0;
 236
 237     /*
 238      * save the symbols in order, and a quick lookup of symbol position
 239      */
 240     for (short i = 0; i < count; i++)
 241     {
 242       char symbol = trimmed.charAt(i);
 243       if (symbol >= 'A' && symbol <= 'Z'
 244               && symbolIndexLookup[symbol - 'A'] == -1)
 245       {
 246         symbolIndexLookup[symbol - 'A'] = i;
 247       }
 248       else
 249       {
 250         System.err
 251                 .println(
 252                         "Unexpected or duplicated character in HMM ALPHabet: "
 253                                 + symbol);
 254         ignored++;
 255       }
 256     }
 257     return count - ignored;
 258   }
 259
 260   /**
 261    * Sets the list of nodes in this HMM to the given list.
 262    *
 263    * @param nodes
 264    *          The list of nodes to which the current list of nodes is being
 265    *          changed.
 266    */
 267   public void setNodes(List<HMMNode> nodes)
 268   {
 269     this.nodes = nodes;
 270   }
 271
 272   /**
 273    * Gets the match emission probability for a given symbol at a column in the
 274    * alignment.
 275    *
 276    * @param alignColumn
 277    *          The index of the alignment column, starting at index 0. Index 0
 278    *          usually corresponds to index 1 in the HMM.
 279    * @param symbol
 280    *          The symbol for which the desired probability is being requested.
 281    * @return
 282    *
 283    */
 284   public double getMatchEmissionProbability(int alignColumn, char symbol)
 285   {
 286     int symbolIndex = getSymbolIndex(symbol);
 287     double probability = 0d;
 288     if (symbolIndex != -1 && nodeLookup.containsKey(alignColumn))
 289     {
 290       HMMNode node = nodeLookup.get(alignColumn);
 291       probability = node.getMatchEmission(symbolIndex);
 292     }
 293     return probability;
 294   }
 295
 296   /**
 297    * Gets the insert emission probability for a given symbol at a column in the
 298    * alignment.
 299    *
 300    * @param alignColumn
 301    *          The index of the alignment column, starting at index 0. Index 0
 302    *          usually corresponds to index 1 in the HMM.
 303    * @param symbol
 304    *          The symbol for which the desired probability is being requested.
 305    * @return
 306    *
 307    */
 308   public double getInsertEmissionProbability(int alignColumn, char symbol)
 309   {
 310     int symbolIndex = getSymbolIndex(symbol);
 311     double probability = 0d;
 312     if (symbolIndex != -1 && nodeLookup.containsKey(alignColumn))
 313     {
 314       HMMNode node = nodeLookup.get(alignColumn);
 315       probability = node.getInsertEmission(symbolIndex);
 316     }
 317     return probability;
 318   }
 319
 320   /**
 321    * Gets the state transition probability for a given symbol at a column in the
 322    * alignment.
 323    *
 324    * @param alignColumn
 325    *          The index of the alignment column, starting at index 0. Index 0
 326    *          usually corresponds to index 1 in the HMM.
 327    * @param symbol
 328    *          The symbol for which the desired probability is being requested.
 329    * @return
 330    *
 331    */
 332   public Double getStateTransitionProbability(int alignColumn,
 333           int transition)
 334   {
 335     double probability = 0d;
 336     if (nodeLookup.containsKey(alignColumn))
 337     {
 338       HMMNode node = nodeLookup.get(alignColumn);
 339       probability = node.getStateTransition(transition);
 340     }
 341     return probability;
 342   }
 343
 344   /**
 345    * Returns the alignment column linked to the node at the given index.
 346    *
 347    * @param nodeIndex
 348    *          The index of the node, starting from index 1. Index 0 is the begin
 349    *          node, which does not correspond to a column in the alignment.
 350    * @return
 351    */
 352   public Integer getNodeAlignmentColumn(int nodeIndex)
 353   {
 354     Integer value = nodes.get(nodeIndex).getAlignmentColumn();
 355     return value;
 356   }
 357
 358   /**
 359    * Returns the consensus residue at the specified node.
 360    *
 361    * @param nodeIndex
 362    *          The index of the specified node.
 363    * @return
 364    */
 365   public char getConsensusResidue(int nodeIndex)
 366   {
 367    char value = nodes.get(nodeIndex).getConsensusResidue();
 368    return value;
 369   }
 370
 371   /**
 372    * Returns the consensus at a given alignment column. If the character is
 373    * lower case, its emission probability is less than 0.5.
 374    *
 375    * @param columnIndex
 376    *          The index of the column in the alignment for which the consensus
 377    *          is desired. The list of columns starts at index 0.
 378    * @return
 379    */
 380   public char getConsensusAtAlignColumn(int columnIndex)
 381   {
 382     char mostLikely = '-';
 383     if (getBooleanProperty(HMMFile.CONSENSUS_RESIDUE))
 384     {
 385       HMMNode node = nodeLookup.get(columnIndex);
 386       if (node == null)
 387       {
 388         return '-';
 389       }
 390       mostLikely = node.getConsensusResidue();
 391       return mostLikely;
 392     }
 393     else
 394     {
 395       double highestProb = 0;
 396       for (char character : alphabet.toCharArray())
 397       {
 398         double prob = getMatchEmissionProbability(columnIndex, character);
 399         if (prob > highestProb)
 400         {
 401           highestProb = prob;
 402           mostLikely = character;
 403         }
 404       }
 405       if (highestProb < 0.5)
 406       {
 407         mostLikely = Character.toLowerCase(mostLikely);
 408       }
 409       return mostLikely;
 410     }
 411
 412   }
 413
 414   /**
 415    * Returns the reference annotation at the specified node.
 416    *
 417    * @param nodeIndex
 418    *          The index of the specified node.
 419    * @return
 420    */
 421   public char getReferenceAnnotation(int nodeIndex)
 422   {
 423    char value = nodes.get(nodeIndex).getReferenceAnnotation();
 424    return value;
 425   }
 426
 427   /**
 428    * Returns the mask value at the specified node.
 429    *
 430    * @param nodeIndex
 431    *          The index of the specified node.
 432    * @return
 433    */
 434   public char getMaskedValue(int nodeIndex)
 435   {
 436    char value = nodes.get(nodeIndex).getMaskValue();
 437    return value;
 438   }
 439
 440   /**
 441    * Returns the consensus structure at the specified node.
 442    *
 443    * @param nodeIndex
 444    *          The index of the specified node.
 445    * @return
 446    */
 447   public char getConsensusStructure(int nodeIndex)
 448   {
 449    char value = nodes.get(nodeIndex).getConsensusStructure();
 450    return value;
 451   }
 452
 453   /**
 454    * Returns the number of symbols in the alphabet used in this HMM.
 455    *
 456    * @return
 457    */
 458   public int getNumberOfSymbols()
 459   {
 460     return alphabet.length();
 461   }
 462
 463   /**
 464    * Sets a property read from an HMM file
 465    *
 466    * @param key
 467    * @param value
 468    */
 469   public void setProperty(String key, String value)
 470   {
 471     fileProperties.put(key, value);
 472   }
 473
 474   /**
 475    * Sets the alignment column of the specified node
 476    *
 477    * @param nodeIndex
 478    *
 479    * @param column
 480    *
 481    */
 482   public void setAlignmentColumn(HMMNode node, int column)
 483   {
 484     node.setAlignmentColumn(column);
 485     nodeLookup.put(column, node);
 486   }
 487
 488   public void updateMapping(char[] sequence)
 489   {
 490     int nodeNo = 1;
 491     int column = 0;
 492     synchronized (nodeLookup)
 493     {
 494       clearNodeLookup();
 495       for (char residue : sequence)
 496       {
 497         if (!Comparison.isGap(residue))
 498         {
 499           HMMNode node = nodes.get(nodeNo);
 500           if (node == null)
 501           {
 502             // error : too few nodes for sequence
 503             break;
 504           }
 505           setAlignmentColumn(node, column);
 506           nodeNo++;
 507         }
 508         column++;
 509       }
 510     }
 511   }
 512
 513   /**
 514    * Clears all data in the node lookup map
 515    */
 516   public void clearNodeLookup()
 517   {
 518     nodeLookup.clear();
 519   }
 520
 521   /**
 522    * Sets the reference annotation at a given node
 523    *
 524    * @param nodeIndex
 525    * @param value
 526    */
 527   public void setReferenceAnnotation(int nodeIndex, char value)
 528   {
 529     nodes.get(nodeIndex).setReferenceAnnotation(value);
 530   }
 531
 532   /**
 533    * Sets the consensus residue at a given node
 534    *
 535    * @param nodeIndex
 536    * @param value
 537    */
 538   public void setConsensusResidue(int nodeIndex, char value)
 539   {
 540     nodes.get(nodeIndex).setConsensusResidue(value);
 541   }
 542
 543   /**
 544    * Sets the consensus structure at a given node
 545    *
 546    * @param nodeIndex
 547    * @param value
 548    */
 549   public void setConsensusStructure(int nodeIndex, char value)
 550   {
 551     nodes.get(nodeIndex).setConsensusStructure(value);
 552   }
 553
 554   /**
 555    * Sets the mask value at a given node
 556    *
 557    * @param nodeIndex
 558    * @param value
 559    */
 560   public void setMaskValue(int nodeIndex, char value)
 561   {
 562     nodes.get(nodeIndex).setMaskValue(value);
 563   }
 564
 565   /**
 566    * Temporary implementation, should not be used.
 567    *
 568    * @return
 569    */
 570   public String getViterbi()
 571   {
 572     String value;
 573     value = fileProperties.get(HMMFile.VITERBI);
 574     return value;
 575   }
 576
 577   /**
 578    * Temporary implementation, should not be used.
 579    *
 580    * @return
 581    */
 582   public String getMSV()
 583   {
 584     String value;
 585     value = fileProperties.get(HMMFile.MSV);
 586     return value;
 587   }
 588
 589   /**
 590    * Temporary implementation, should not be used.
 591    *
 592    * @return
 593    */
 594   public String getForward()
 595   {
 596     String value;
 597     value = fileProperties.get(HMMFile.FORWARD);
 598     return value;
 599   }
 600
 601   /**
 602    * Answers the HMMNode mapped to the given alignment column (base 0), or null
 603    * if none is mapped
 604    *
 605    * @param alignmentColumn
 606    */
 607   public HMMNode getNodeForColumn(int alignmentColumn)
 608   {
 609     return nodeLookup.get(alignmentColumn);
 610   }
 611
 612   /**
 613    * Returns the consensus sequence based on the most probable symbol at each
 614    * position. The sequence is adjusted to match the length of the existing
 615    * sequence alignment. Gap characters are used as padding.
 616    *
 617    * @return
 618    */
 619   public Sequence getConsensusSequence()
 620   {
 621     int start;
 622     int end;
 623     int modelLength;
 624     start = getNodeAlignmentColumn(1);
 625     modelLength = getLength();
 626     end = getNodeAlignmentColumn(modelLength);
 627     char[] sequence = new char[end + 1];
 628     for (int index = 0; index < end + 1; index++)
 629     {
 630       Character character;
 631
 632         character = getConsensusAtAlignColumn(index);
 633
 634       if (character == null || character == '-')
 635       {
 636         sequence[index] = '-';
 637       }
 638       else
 639       {
 640         sequence[index] = Character.toUpperCase(character);
 641       }
 642       }
 643
 644
 645     Sequence seq = new Sequence(getName(), sequence, start,
 646             end);
 647     return seq;
 648   }
 649
 650
 651   /**
 652    * Initiates a HMM consensus sequence
 653    *
 654    * @return A new HMM consensus sequence
 655    */
 656   public SequenceI initHMMSequence()
 657   {
 658     Sequence consensus = getConsensusSequence();
 659     consensus.setIsHMMConsensusSequence(true);
 660     consensus.setHMM(this);
 661     return consensus;
 662   }
 663
 664   /**
 665    * Answers the index position (0...) of the given symbol, or -1 if not a valid
 666    * symbol for this HMM
 667    *
 668    * @param symbol
 669    * @return
 670    */
 671   public int getSymbolIndex(char symbol)
 672   {
 673     /*
 674      * symbolIndexLookup holds the index for 'A' to 'Z'
 675      */
 676     char c = Character.toUpperCase(symbol);
 677     if ('A' <= c && c <= 'Z')
 678     {
 679       return symbolIndexLookup[c - 'A'];
 680     }
 681     return -1;
 682   }
 683
 684   public void addNode(HMMNode node)
 685   {
 686     nodes.add(node);
 687   }
 688 }
 689