binaries/src/clustalo/src/hhalign/hhhmm.h

   1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2
   3 /*********************************************************************
   4  * Clustal Omega - Multiple sequence alignment
   5  *
   6  * Copyright (C) 2010 University College Dublin
   7  *
   8  * Clustal-Omega is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This file is part of Clustal-Omega.
  14  *
  15  ********************************************************************/
  16
  17 /*
  18  * RCS $Id: hhhmm.h 165 2010-12-22 16:24:48Z fabian $
  19  */
  20
  21 // hhhmm.h
  22
  23
  24 class HMM
  25 {
  26  public:
  27   HMM(int maxseqdis=MAXSEQDIS, int maxres=/*MAXRES*/par.maxResLen);
  28   ~HMM();
  29   HMM& operator=(HMM&);
  30
  31   int n_display;            // number of sequences stored for display of alignment (INCLUDING >ss_ and >cf_ sequences)
  32   int n_seqs;               // number of sequences read in (INCLUDING >ss_ and >cf_ sequences)
  33   char** sname;             // names of stored sequences
  34   char** seq;               // residues of stored sequences (first at pos 1!)
  35   int ncons;                // index of consensus sequence
  36   int nfirst;               // index of first sequence (query sequence of HMM)
  37   int nss_dssp;             // index of seq[] with secondary structure by dssp
  38   int nsa_dssp;             // index of seq[] with solvent accessibility by dssp
  39   int nss_pred;             // index of seq[] with predicted secondary structure
  40   int nss_conf;             // index of seq[] with confidence values for secondary structure prediction
  41
  42   int L;                    // length of HMM = number of match states; set in declaration of HMM object
  43   int N_in;                 // number of sequences in alignment
  44   int N_filtered;           // number of sequences after filtering
  45   float* Neff_M;            // Neff_M[i] = diversity of subalignment of seqs that have residue in col i
  46   float* Neff_I;            // Neff_I[i] = diversity of subalignment of seqs that have insert in col i
  47   float* Neff_D;            // Neff_D[i] = diversity of subalignment of seqs that have delete in col i
  48   float Neff_HMM;           // average number of Neff over total length of HMM
  49
  50   char* longname;           // Full name of first sequence of original alignment (NAME field)
  51   char name[NAMELEN];       // HMM name = first word in longname in lower case
  52   char file[NAMELEN];       // Basename (with path, without extension) of alignment file that was used to construct the HMM
  53   char fam[NAMELEN];        // family ID (derived from name) (FAM field)
  54   char sfam[NAMELEN];       // superfamily ID (derived from name)
  55   char fold[NAMELEN];       // fold ID (derived from name)
  56   char cl[NAMELEN];         // class ID (derived from name)
  57
  58   float lamda, mu;          // coefficients for aa score distribution of HMM using parameters in 'Parameters par'
  59   bool has_pseudocounts;    // set to true if HMM contains pseudocounts
  60
  61   // Make a flat copy of q
  62   void FlatCopyTo(HMM& t);
  63
  64   // Read an HMM from a HHsearch .hhm file and return 0 at end of file
  65   int Read(FILE* dbf, char* path=NULL);
  66
  67   // Read an HMM from a HMMer .hmm file; return 0 at end of file
  68   int ReadHMMer(FILE* dbf, char* filestr=NULL);
  69
  70   // Read an HMM from a HMMer3 .hmm file; return 0 at end of file
  71   int ReadHMMer3(FILE* dbf, char* filestr=NULL);
  72
  73   // Add transition pseudocounts to HMM
  74   void AddTransitionPseudocounts(float gapd=par.gapd, float gape=par.gape, float gapf=par.gapf, float gapg=par.gapg, float gaph=par.gaph, float gapi=par.gapi, float gapb=par.gapb);
  75
  76   // Use secondary structure-dependent gap penalties on top of the HMM transition penalties
  77   void UseSecStrucDependentGapPenalties();
  78
  79   // Generate an amino acid frequency matrix g[][] with full pseudocount admixture (tau=1)
  80   void PreparePseudocounts();
  81
  82   // Add amino acid pseudocounts to HMM: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
  83   void AddAminoAcidPseudocounts(char pcm=par.pcm, float pca=par.pca, float pcb=par.pcb, float pcc=par.pcc);
  84
  85   // Add no amino acid pseudocounts to HMM: copy  t.p[i][a] = f[i][a]
  86   void NoAminoAcidPseudocounts() {for(int i=1; i<=L; i++) for(int a=0; a<20; a++) p[i][a]=f[i][a];};
  87
  88   // Factor Null model into HMM t
  89   void IncludeNullModelInHMM(HMM& q, HMM& t);
  90
  91   // Write HMM to output file
  92   void WriteToFile(char* outfile);
  93
  94   // Insert calibration line 'EVD   lamda   mu      hashvalue' into HMM file
  95   void InsertCalibration(char* infile);
  96
  97   // Write HMM to output file in HMMER format
  98   void WriteToFileHMMER(char* outfile);
  99
 100   // Transform log to lin transition probs
 101   void Log2LinTransitionProbs(float beta=1.0);
 102
 103   // Set query columns in His-tags etc to Null model distribution
 104   void NeutralizeTags();
 105
 106   // Calculate effective number of sequences using profiles INCLUDING pseudocounts
 107   float CalcNeff();
 108
 109   // Calculate consensus of HMM (needed to merge HMMs later)
 110   void CalculateConsensus();
 111
 112   // Store linear transition probabilities
 113   void StoreLinearTransitionProbs();
 114
 115   // Initialize f[i][a] with query HMM
 116   void MergeQueryHMM(HMM& q, float wk[]);
 117
 118   // Normalize probabilities in total merged super-HMM
 119   void NormalizeHMMandTransitionsLin2Log();
 120
 121   // Rescale rate matrices P[a][b], R[a][b] according to HMM av. aa composition in pav[a]
 122   void RescaleMatrix();
 123
 124 #ifdef CLUSTALO
 125   void ClobberGlobal(void);
 126   char cQT; /* query or template */
 127 #endif
 128
 129 private:
 130   float** f;                // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts
 131   float** g;                // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts
 132   float** p;                // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts
 133   float** tr;               // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
 134 /*   float** tr_lin;           // transition probs in log space */
 135   char trans_lin;           // transition probs are given in log or lin space? (0: p_tr  1: log(p_tr)
 136
 137   char* ss_dssp;            // secondary structure determined by dssp 0:-  1:H  2:E  3:C  4:S  5:T  6:G  7:B
 138   char* sa_dssp;            // solvent accessibility state determined by dssp 0:-  1:A (absolutely buried) 2:B  3:C  4:D  5:E (exposed)
 139   char* ss_pred;            // predicted secondary structure          0:-  1:H  2:E  3:C
 140   char* ss_conf;            // confidence value of prediction         0:-  1:0 ... 10:9
 141   char* Xcons;              // consensus sequence in internal representation (A=0 R=1 N=2 D=3 ...)
 142   float pav[NAA];           // pav[a] = average freq of amino acids in HMM (including subst matrix pseudocounts)
 143   float pnul[NAA];          // null model probabilities used in comparison (only set in template/db HMMs)
 144   int* l;                   // l[i] = pos. of j'th match state in aligment
 145 /*   char trans_lin;           // transition probs are given in log or lin space? (0: p_tr  1: log(p_tr)  */
 146
 147   // Utility for Read()
 148   int Warning(FILE* dbf, char line[], char name[])
 149     {
 150       if (v) cerr<<"\nWARNING: could not read line\n\'"<<line<<"\'\nin HMM "<<name<<" in "<<file<<"\n";
 151       while (fgetline(line,LINELEN,dbf) && !(line[0]=='/' && line[1]=='/'));
 152       if (line) return 2;  //return status: skip HMM
 153       return 0;            //return status: end of database file
 154     }
 155
 156   friend class Hit;
 157   friend class Alignment;
 158   friend class HMMshadow;
 159 };
 160
 161 class HMMshadow {
 162
 163  public:
 164     float *Neff_M;
 165     float *Neff_I;
 166     float *Neff_D;
 167     float **f;
 168     float **g;
 169     float **p;
 170     float **tr;
 171     float pav[20];
 172
 173     void copyHMMtoShadow(const HMM &hmm) {
 174         Neff_M = hmm.Neff_M;
 175         Neff_I = hmm.Neff_I;
 176         Neff_D = hmm.Neff_D;
 177         f  = hmm.f;
 178         g  = hmm.g;
 179         p  = hmm.p;
 180         tr = hmm.tr;
 181         memcpy(pav, hmm.pav, 20*sizeof(float));
 182     }
 183
 184     void copyShadowToHMM(const HMM &hmm, const hmm_light rShadow) {
 185
 186         int i, j;
 187
 188         for (i = 0; i < rShadow.L+1; i++){
 189             hmm.Neff_M[i] = rShadow.Neff_M[i];
 190             hmm.Neff_I[i] = rShadow.Neff_I[i];
 191             hmm.Neff_D[i] = rShadow.Neff_D[i];
 192             for (j = 0; j < 20; j++){
 193                 hmm.f[i][j] = rShadow.f[i][j];
 194                 hmm.g[i][j] = rShadow.g[i][j];
 195                 hmm.p[i][j] = rShadow.p[i][j];
 196             }
 197             for (j = 0; j < 7; j++){
 198                 hmm.tr[i][j] = rShadow.tr[i][j];
 199             }
 200             memcpy((void *)hmm.pav, rShadow.pav, 20*sizeof(float));
 201         }
 202     } /* this is the end of copyShadowToHMM() */
 203
 204 } /* class HMMshadow */;