binaries/src/clustalo/src/hhalign/hhhmm-C.h

   1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2
   3 /*********************************************************************
   4  * Clustal Omega - Multiple sequence alignment
   5  *
   6  * Copyright (C) 2010 University College Dublin
   7  *
   8  * Clustal-Omega is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This file is part of Clustal-Omega.
  14  *
  15  ********************************************************************/
  16
  17 /*
  18  * RCS $Id: hhhmm-C.h 316 2016-12-16 16:14:39Z fabian $
  19  */
  20
  21
  22 // hhhmm.C
  23
  24 #ifndef MAIN
  25 #define MAIN
  26 #include <iostream>   // cin, cout, cerr
  27 #include <fstream>    // ofstream, ifstream
  28 #include <stdio.h>    // printf
  29 using std::cout;
  30 using std::cerr;
  31 using std::endl;
  32 using std::ios;
  33 using std::ifstream;
  34 using std::ofstream;
  35 #include <stdlib.h>   // exit
  36 #include <string>     // strcmp, strstr
  37 #include <math.h>     // sqrt, pow
  38 #include <limits.h>   // INT_MIN
  39 #include <float.h>    // FLT_MIN
  40 #include <time.h>     // clock
  41 #include <ctype.h>    // islower, isdigit etc
  42 #include "util-C.h"     // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
  43 #include "list.h"     // list data structure
  44 #include "hash.h"     // hash data structure
  45 #include "hhdecl-C.h"
  46 #include "hhutil-C.h"   // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
  47 #endif
  48 //#include "new_new.h" /* memory tracking */
  49
  50 // #ifndef WNLIB
  51 // #define WNLIB
  52 // #include "wnconj.h"   // Will Naylor's wnlib for optimization in C
  53 // #endif
  54
  55 //////////////////////////////////////////////////////////////////////////////
  56 //// Class HMM
  57 //////////////////////////////////////////////////////////////////////////////
  58
  59 //////////////////////////////////////////////////////////////////////////////
  60 // Object constructor
  61 //////////////////////////////////////////////////////////////////////////////
  62 HMM::HMM(int maxseqdis, int maxres)
  63 {
  64     sname = new char*[maxseqdis]();   // names of stored sequences
  65     for (int i = 0; i < maxseqdis; i++){ sname[i] = NULL;}
  66     seq = new char*[maxseqdis]();     // residues of stored sequences (first at pos 1!)
  67     for (int i = 0; i < maxseqdis; i++){ seq[i] = NULL;}
  68     Neff_M = new float[maxres]();     // Neff_M[i] = diversity of subalignment of seqs that have residue in col i
  69     Neff_I = new float[maxres]();     // Neff_I[i] = diversity of subalignment of seqs that have insert in col i
  70     Neff_D = new float[maxres]();     // Neff_D[i] = diversity of subalignment of seqs that have delete in col i
  71     longname = new char[DESCLEN]();   // Full name of first sequence of original alignment (NAME field)
  72     ss_dssp = new char[maxres]();     // secondary structure determined by dssp 0:-  1:H  2:E  3:C  4:S  5:T  6:G  7:B
  73     sa_dssp = new char[maxres]();     // solvent accessibility state determined by dssp 0:-  1:A (absolutely buried) 2:B  3:C  4:D  5:E (exposed)
  74     ss_pred = new char[maxres]();     // predicted secondary structure          0:-  1:H  2:E  3:C
  75     ss_conf = new char[maxres]();     // confidence value of prediction         0:-  1:0 ... 10:9
  76     Xcons   = NULL;                 // create only when needed: consensus sequence in internal representation (A=0 R=1 N=2 D=3 ...)
  77     l = new int[maxres]();            // l[i] = pos. of j'th match state in aligment
  78     /* FS introduced sentinel, NULL terminates loop in destructor, FS, r221->222 */
  79     f = new float*[maxres+1]();   f[maxres] = NULL;  // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts
  80     g = new float*[maxres+1]();   g[maxres] = NULL;  // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts
  81     p = new float*[maxres+1]();   p[maxres] = NULL;  // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts
  82     tr = new float*[maxres+1](); tr[maxres] = NULL;  // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
  83     //   tr_lin = new float*[maxres];    // linear transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
  84     for (int i=0; i<maxres; i++) {f[i]=new(float[NAA+3])();}
  85     for (int i=0; i<maxres; i++) {g[i]=new(float[NAA])();}
  86     for (int i=0; i<maxres; i++) {p[i]=new(float[NAA])();}
  87     for (int i=0; i<maxres; i++) {tr[i]=new(float[NTRANS])();}
  88     //   for (int i=0; i<maxres; i++) tr_lin[i]=new(float[NTRANS]);
  89     L=0;
  90     Neff_HMM=0;
  91     n_display=N_in=N_filtered=0;
  92     nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
  93     //   lamda_hash.New(37,0.0); // Set size and NULL element for hash
  94     //   mu_hash.New(37,0.0);    // Set size and NULL element for hash
  95     lamda=0.0; mu=0.0;
  96     name[0]=longname[0]=fam[0]='\0';
  97     trans_lin=0; // transition probs in log space
  98 }
  99
 100
 101 //////////////////////////////////////////////////////////////////////////////
 102 // Object destructor
 103 //////////////////////////////////////////////////////////////////////////////
 104 HMM::~HMM()
 105 {
 106   //Delete name and seq matrices
 107     if (NULL != sname){
 108         for (int k=0; (k < n_display) && (NULL != sname[k]); k++){
 109             delete[] sname[k]; sname[k] = NULL;
 110         }
 111         delete[] sname;    sname = NULL;
 112     }
 113     if (NULL != seq){
 114         for (int k=0; (k < n_display) && (NULL != seq[k]); k++){
 115             delete[] seq[k];  seq[k] = NULL;
 116         }
 117         delete[] seq;    seq = NULL;
 118     }
 119   delete[] Neff_M;    Neff_M = NULL;
 120   delete[] Neff_D;    Neff_D = NULL;
 121   delete[] Neff_I;    Neff_I = NULL;
 122   delete[] longname;    longname = NULL;
 123   delete[] ss_dssp;  ss_dssp = NULL;
 124   delete[] sa_dssp;  sa_dssp = NULL;
 125   delete[] ss_pred;  ss_pred = NULL;
 126   delete[] ss_conf;  ss_conf = NULL;
 127   delete[] Xcons;  Xcons = NULL;
 128   delete[] l;  l = NULL;
 129   for (int i=0; i</*MAXRES*/par.maxResLen; i++){
 130     if (f[i]){
 131       delete[] f[i];   f[i] = NULL;
 132     }
 133     else break;
 134   }
 135   for (int i=0; i</*MAXRES*/par.maxResLen; i++){
 136     if (g[i]){
 137       delete[] g[i];   g[i] = NULL;
 138     }
 139     else break;
 140   }
 141   for (int i=0; i</*MAXRES*/par.maxResLen; i++){
 142     if (p[i]){
 143       delete[] p[i];  p[i] = NULL;
 144     }
 145     else break;
 146   }
 147   for (int i=0; i</*MAXRES*/par.maxResLen; i++){
 148     if (tr[i]){
 149       delete[] tr[i];  tr[i] = NULL;
 150     }
 151     else break;
 152   }
 153   //   for (int i=0; i</*MAXRES*/par.maxResLen; i++) if (tr_lin[i]) delete[] tr_lin[i]; else break;
 154   delete[] f;  f = NULL;
 155   delete[] g;  g = NULL;
 156   delete[] p;  p = NULL;
 157   delete[] tr;  tr = NULL;
 158 //   delete[] tr_lin;
 159 }
 160
 161 //////////////////////////////////////////////////////////////////////////////
 162 // Deep-copy constructor
 163 //////////////////////////////////////////////////////////////////////////////
 164 HMM& HMM::operator=(HMM& q)
 165 {
 166   L=q.L;
 167   for (int i=0; i<=L+1; ++i)
 168     {
 169       for (int a=0; a<NAA; ++a)
 170         {
 171           f[i][a]=q.f[i][a];
 172           g[i][a]=q.g[i][a];
 173           p[i][a]=q.p[i][a];
 174         }
 175       for (int a=0; a<NTRANS; ++a)
 176         tr[i][a]=q.tr[i][a];
 177       ss_dssp[i]=q.ss_dssp[i];
 178       sa_dssp[i]=q.sa_dssp[i];
 179       ss_pred[i]=q.ss_pred[i];
 180       ss_conf[i]=q.ss_conf[i];
 181       l[i]=q.l[i];
 182     }
 183   if (q.Xcons)
 184     for (int i=0; i<=L+1; ++i)
 185       Xcons[i]  =q.Xcons[i];
 186
 187   n_display=q.n_display;
 188   for (int k=0; k<n_display; k++) {
 189     sname[k]=new char[strlen(q.sname[k])+1];
 190     if (!sname[k]) MemoryError("array of names for sequences to display");
 191     strcpy(sname[k],q.sname[k]);
 192   }
 193   for (int k=0; k<n_display; k++) {
 194     seq[k]=new char[strlen(q.seq[k])+1];
 195     if (!seq[k]) MemoryError("array of names for sequences to display");
 196     strcpy(seq[k],q.seq[k]);
 197   }
 198   ncons=q.ncons;
 199   nfirst=q.nfirst;
 200   nss_dssp=q.nss_dssp;
 201   nsa_dssp=q.nsa_dssp;
 202   nss_pred=q.nss_pred;
 203   nss_conf=q.nss_conf;
 204
 205   for (int i=0; i<=L+1; ++i) Neff_M[i]=q.Neff_M[i];
 206   for (int i=0; i<=L+1; ++i) Neff_I[i]=q.Neff_I[i];
 207   for (int i=0; i<=L+1; ++i) Neff_D[i]=q.Neff_D[i];
 208   Neff_HMM=q.Neff_HMM;
 209
 210   strcpy(longname,q.longname);
 211   strcpy(name,q.name);
 212   strcpy(fam,q.fam);
 213   strcpy(sfam,q.sfam);
 214   strcpy(fold,q.fold);
 215   strcpy(cl,q.cl);
 216   strcpy(file,q.file);
 217
 218   lamda=q.lamda;
 219   mu=q.mu;
 220
 221   for (int a=0; a<NAA; ++a) pav[a]=q.pav[a];
 222   N_in=q.N_in;
 223   N_filtered=q.N_filtered;
 224   trans_lin=q.trans_lin;
 225   return (HMM&) (*this);
 226 }
 227
 228
 229 ///////////////////////////////////////////////////////////////////////////////
 230 /**
 231  * @brief Read an HMM from an HHsearch .hhm file; return 0 at end of file
 232  */
 233 int
 234 HMM::Read(FILE* dbf, char* path)
 235 {
 236   char line[LINELEN]="";    // input line
 237   char str3[8]="",str4[8]=""; // first 3 and 4 letters of input line
 238   char* ptr;                // pointer for string manipulation
 239   int i=0;                  // index for match state (first=1)
 240   int a;                    // amino acid index
 241   static int warn=0;
 242
 243   trans_lin=0;
 244   L=0;
 245   Neff_HMM=0;
 246   n_display=N_in=N_filtered=0;
 247   nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
 248   lamda=mu=0.0;
 249   trans_lin=0; // transition probs in log space
 250   name[0]=longname[0]=fam[0]='\0';
 251   //If at the end of while-loop L is still 0 then we have reached end of db file
 252
 253   //Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
 254
 255   while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
 256     {
 257
 258       if (strscn(line)==NULL) continue;    // skip lines that contain only white space
 259       substr(str3,line,0,2);               // copy the first three characters into str3
 260       substr(str4,line,0,3);               // copy the first four characters into str4
 261
 262       if (!strncmp("HH",line,2)) continue;
 263
 264       if (!strcmp("NAME",str4))
 265         {
 266           ptr=strscn(line+4);              //advance to first non-white-space character
 267           if (ptr)
 268             {
 269               strncpy(longname,ptr,DESCLEN-1); //copy full name to longname
 270               longname[DESCLEN-1]='\0';
 271               strncpy(name,ptr,NAMELEN-1);     //copy longname to name...
 272               strcut(name);                    //...cut after first word...
 273             }
 274           else
 275             {
 276               strcpy(longname,"undefined");
 277               strcpy(name,"undefined");
 278             }
 279           if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
 280         }
 281
 282       else if (!strcmp("FAM",str3))
 283         {
 284           ptr=strscn(line+3);              //advance to first non-white-space character
 285           if (ptr) strncpy(fam,ptr,IDLEN-1); else strcpy(fam,""); //copy family name to basename
 286           ScopID(cl,fold,sfam,fam);        //get scop classification from basename (e.g. a.1.2.3.4)
 287         }
 288
 289       else if (!strcmp("FILE",str4))
 290         {
 291           if (path) strncpy(file,path,NAMELEN-1); else *file='\0'; // copy path to file variable
 292           ptr=strscn(line+4);              //advance to first non-white-space character
 293           if (ptr)
 294             strncat(file,ptr,NAMELEN-1-strlen(file));   // append file name read from file to path
 295           else strcat(file,"*");
 296         }
 297
 298       else if (!strcmp("LENG",str4))
 299         {
 300           ptr=line+4;
 301           L=strint(ptr);                   //read next integer (number of match states)
 302         }
 303       else if (!strcmp("FILT",str4) || !strcmp("NSEQ",str4))
 304         {
 305           ptr=line+4;
 306           N_filtered=strint(ptr);          //read next integer: number of sequences after filtering
 307           N_in=strint(ptr);                //read next integer: number of sequences in alignment
 308         }
 309
 310       else if (!strcmp("NEFF",str4) || !strcmp("NAA",str3)) sscanf(line+6,"%f",&Neff_HMM);
 311
 312       else if (!strcmp("EVD",str3))
 313         {
 314 //        char key[IDLEN];
 315           sscanf(line+6,"%f %f",&lamda,&mu);
 316 //        sscanf(line+22,"%s",key);
 317 //        lamda_hash.Add(key,lamda);
 318 //        mu_hash.Add(key,mu);
 319         }
 320
 321       else if (!strcmp("DESC",str4)) continue;
 322       else if (!strcmp("COM",str3))  continue;
 323       else if (!strcmp("DATE",str4)) continue;
 324
 325       /////////////////////////////////////////////////////////////////////////////////////
 326       // Read template sequences that should get displayed in output alignments
 327       else if (!strcmp("SEQ",str3))
 328         {
 329         //char cur_seq[MAXCOL]=""; //Sequence currently read in
 330         char *cur_seq = new char[par.maxColCnt]; //Sequence currently read in
 331           int k;                // sequence index; start with -1; after reading name of n'th sequence-> k=n
 332           int h;                // index for character in input line
 333           int l=1;              // index of character in sequence seq[k]
 334           int i=1;              // index of match states in ss_dssp[i] and ss_pred[i] sequence
 335           int n_seq=0;          // number of sequences to be displayed EXCLUDING ss sequences
 336           cur_seq[0]='-';       // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
 337           k=-1;
 338           while (fgetline(line,LINELEN-1,dbf) && line[0]!='#')
 339             {
 340               if (v>=4) cout<<"Read from file:"<<line<<"\n"; //DEBUG
 341               if (line[0]=='>') //line contains sequence name
 342                 {
 343                   if (k>=MAXSEQDIS-1) //maximum number of allowable sequences exceeded
 344                     {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); break;}
 345                   k++;
 346                   if      (!strncmp(line,">ss_dssp",8)) nss_dssp=k;
 347                   else if (!strncmp(line,">sa_dssp",8)) nsa_dssp=k;
 348                   else if (!strncmp(line,">ss_pred",8)) nss_pred=k;
 349                   else if (!strncmp(line,">ss_conf",8)) nss_conf=k;
 350                   else if (!strncmp(line,">Cons-",6) || !strncmp(line,">Consensus",10)) ncons=k;
 351                   else
 352                     {
 353                       if (nfirst==-1) nfirst=k;
 354                       if (n_seq>=par.nseqdis)
 355                         {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); k--; break;}
 356                       n_seq++;
 357                     }
 358
 359                   //If this is not the first sequence then store residues of previous sequence
 360                   if (k>0) {
 361                     seq[k-1]=new char[strlen(cur_seq)+1];
 362                     if (!seq[k-1]) MemoryError("array of sequences to display");
 363                     strcpy(seq[k-1],cur_seq);
 364                   }
 365
 366                   // store sequence name
 367                   strcut(line+1); //find next white-space character and overwrite it with end-of-string character
 368                   sname[k] = new char[strlen(line+1)+1]; //+1 for terminating '\0'
 369                   if (!sname[k]) MemoryError("array of names for sequences to display");
 370                   strcpy(sname[k],line+1);           //store sequence name in **name
 371                   l=1; i=1;
 372                 }
 373               else //line contains sequence residues
 374                 {
 375                   if (k==-1)
 376                     {
 377                       cerr<<endl<<"WARNING: Ignoring following line while reading HMM"<<name<<":\n\'"<<line<<"\'\n";
 378                       continue;
 379                     }
 380
 381                   h=0; //counts characters in current line
 382
 383                   // Check whether all characters are correct; store into cur_seq
 384                   if (k==nss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
 385                     {
 386                       while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 387                         {
 388                           if (ss2i(line[h])>=0 && line[h]!='.')
 389                             {
 390                               char c=ss2ss(line[h]);
 391                               cur_seq[l]=c;
 392                               if (c!='.' && !(c>='a' && c<='z')) ss_dssp[i++]=ss2i(c);
 393                               l++;
 394                             }
 395                           else if (v && ss2i(line[h])==-2)
 396                             cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
 397                           h++;
 398                         }
 399                     }
 400                   if (k==nsa_dssp) // lines with dssp secondary solvent accessibility (- A B C D E)
 401                     {
 402                       while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 403                         {
 404                           if (sa2i(line[h])>=0)
 405                             {
 406                               char c=line[h];
 407                               cur_seq[l]=c;
 408                               if (c!='.' && !(c>='a' && c<='z')) sa_dssp[i++]=sa2i(c);
 409                               l++;
 410                             }
 411                           else if (v && sa2i(line[h])==-2)
 412                             cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
 413                           h++;
 414                         }
 415                     }
 416                   else if (k==nss_pred) // lines with predicted secondary structure (. - H E C)
 417                     {
 418                       while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 419                         {
 420                           if (ss2i(line[h])>=0 && ss2i(line[h])<=3 && line[h]!='.')
 421                             {
 422                               char c=ss2ss(line[h]);
 423                               cur_seq[l]=c;
 424                               if (c!='.' && !(c>='a' && c<='z')) ss_pred[i++]=ss2i(c);
 425                               l++;
 426                             }
 427                           else if (v && ss2i(line[h])==-2)
 428                             cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
 429                           h++;
 430                         }
 431                     }
 432                   else if (k==nss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
 433                     {
 434                       while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 435                         {
 436                           if (line[h]=='-' || (line[h]>='0' && line[h]<='9'))
 437                             {
 438                               cur_seq[l]=line[h];
 439                               ss_conf[l]=cf2i(line[h]);
 440                               l++;
 441                             }
 442                           else if (v && cf2i(line[h])==-2)
 443                             cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
 444                           h++;
 445                         }
 446                     }
 447                   else // normal line containing residues
 448                     {
 449                       while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 450                         {
 451                           if (aa2i(line[h])>=0 && line[h]!='.') // ignore '.' and white-space characters ' ', \t and \n (aa2i()==-1)
 452                             {cur_seq[l]=line[h]; l++;}
 453                           else if (aa2i(line[h])==-2 && v)
 454                             cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
 455                           h++;
 456                         }
 457                     }
 458                   cur_seq[l]='\0';  //Ensure that cur_seq ends with a '\0' character
 459
 460                 } //end else
 461             } //while(getline)
 462           //If this is not the first sequence some residues have already been read in
 463           if (k>=0) {
 464             seq[k]=new char[strlen(cur_seq)+1];
 465             if (!seq[k]) MemoryError("array of sequences to display");
 466             strcpy(seq[k],cur_seq);
 467           }
 468           n_display=k+1;
 469
 470           // DEBUG
 471           if (v>=4)
 472             {
 473               printf("nss_dssp=%i  nsa_dssp=%i  nss_pred=%i  nss_conf=%i  nfirst=%i\n",nss_dssp,nsa_dssp,nss_pred,nss_conf,nfirst);
 474               for (k=0; k<n_display; k++)
 475                 {
 476                   int j;
 477                   cout<<">"<<sname[k]<<"(k="<<k<<")\n";
 478                   if      (k==nss_dssp) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_dssp[j]));}
 479                   else if (k==nsa_dssp) {for (j=1; j<=L; j++) cout<<char(i2sa(sa_dssp[j]));}
 480                   else if (k==nss_pred) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_pred[j]));}
 481                   else if (k==nss_conf) {for (j=1; j<=L; j++) cout<<int(ss_conf[j]-1);}
 482                   else                  {for (j=1; j<=L; j++) cout<<seq[k][j];}
 483                   cout<<"\n";
 484                 }
 485             }
 486
 487         } //end if("SEQ")
 488
 489       /////////////////////////////////////////////////////////////////////////////////////
 490       // Read average amino acid frequencies for HMM
 491       else if (!strcmp("FREQ",str4))
 492         {
 493           fprintf(stderr,"Error: hhm file has obsolete format.\n");
 494           fprintf(stderr,"Please use hhmake version > 1.1 to generate hhm files.\n");
 495           exit(1);
 496         }
 497
 498       else if (!strcmp("AVER",str4)) {} // AVER line scrapped
 499       else if (!strcmp("NULL",str4))
 500         {
 501           ptr=line+4;
 502           for (a=0; a<20 && ptr; ++a)
 503             //s2[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
 504             pb[s2a[a]] = (float) fpow2(float(-strinta(ptr))/HMMSCALE);
 505           if (!ptr) return Warning(dbf,line,name);
 506           if (v>=4)
 507             {
 508               printf("\nNULL  ");
 509               for (a=0; a<20; ++a) printf("%5.1f ",100.*pb[s2a[a]]);
 510               printf("\n");
 511             }
 512         }
 513
 514       /////////////////////////////////////////////////////////////////////////////////////
 515       // Read transition probabilities from start state
 516       else if (!strcmp("HMM",str3))
 517         {
 518           fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
 519           fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
 520           ptr=line;
 521           for (a=0; a<=D2D && ptr; ++a)
 522             tr[0][a] = float(-strinta(ptr))/HMMSCALE; //store transition probabilites as log2 values
 523             // strinta returns next integer in string and puts ptr to first char
 524             // after the integer. Returns -99999 if '*' is found.
 525             // ptr is set to 0 if no integer is found after ptr.
 526           Neff_M[0] = float(strinta(ptr))/HMMSCALE;  // Read eff. number of sequences with M->? transition
 527           Neff_I[0] = float(strinta(ptr))/HMMSCALE;  // Read eff. number of sequences with I->? transition
 528           Neff_D[0] = float(strinta(ptr))/HMMSCALE;  // Read eff. number of sequences with D->? transition
 529           if (!ptr) return Warning(dbf,line,name);
 530
 531           /////////////////////////////////////////////////////////////////////////////////////
 532           // Read columns of HMM
 533           int next_i=0;  // index of next column
 534           while (fgetline(line,LINELEN-2,dbf) &&  !(line[0]=='/' && line[1]=='/') && line[0]!='#')
 535             {
 536               if (strscn(line)==NULL) continue; // skip lines that contain only white space
 537
 538               // Read in AA probabilities
 539               ptr=line+1;
 540               int prev_i = next_i;
 541               next_i = strint(ptr); ++i;
 542               if (v && next_i!=prev_i+1)
 543                 if (++warn<=5)
 544                   {
 545                     cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
 546                     if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
 547                   }
 548               if (i>L)
 549                 {
 550                   cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<". Skipping HMM\n";
 551                   return 2;
 552                 }
 553               if (i>=/*MAXRES*/par.maxResLen-2)
 554                 {
 555                   fgetline(line,LINELEN-1,dbf); // Skip line
 556                   continue;
 557                 }
 558
 559               for (a=0; a<20 && ptr; ++a)
 560 //              f[i][s2a[a]] = (float)pow(2.,float(-strinta(ptr))/HMMSCALE);
 561                 f[i][s2a[a]] = fpow2(float(-strinta(ptr))/HMMSCALE);       // speed-up ~5 s for 10000 SCOP domains
 562
 563               //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
 564               l[i]=strint(ptr);
 565               if (!ptr) return Warning(dbf,line,name);
 566               if (v>=4)
 567                 {
 568                   printf("%s",line);
 569                   printf("%6i ",i);
 570                   for (a=0; a<20; ++a) printf("%5.1f ",100*f[i][s2a[a]]);
 571                   printf("%5i",l[i]);
 572                   printf("\n");
 573                 }
 574
 575               // Read transition probabilities
 576               fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
 577               if (line[0]!=' ' && line[0]!='\t') return Warning(dbf,line,name);
 578               ptr=line;
 579               for (a=0; a<=D2D && ptr; ++a)
 580                 tr[i][a] = float(-strinta(ptr))/HMMSCALE; //store transition prob's as log2-values
 581               Neff_M[i] = float(strinta(ptr))/HMMSCALE;  // Read eff. number of sequences with M->? transition
 582               Neff_I[i] = float(strinta(ptr))/HMMSCALE;  // Read eff. number of sequences with I->? transition
 583               Neff_D[i] = float(strinta(ptr))/HMMSCALE;  // Read eff. number of sequences with D->? transition
 584               if (!ptr) return Warning(dbf,line,name);
 585               if (v>=4)
 586                 {
 587                   printf("       ");
 588                   for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
 589                   printf("%5.1f %5.1f %5.1f \n",Neff_M[i],Neff_I[i],Neff_D[i]);
 590                 }
 591             }
 592           if (line[0]=='/' && line[1]=='/') break;
 593         }
 594       else if (v) cerr<<endl<<"WARNING: Ignoring line\n\'"<<line<<"\'\nin HMM "<<name<<"\n";
 595
 596     } //while(getline)
 597
 598   if (L==0) return 0; //End of db file -> stop reading in
 599
 600   // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
 601 //   lamda = lamda_hash.Show(par.Key());
 602 //   mu    = mu_hash.Show(par.Key());
 603   if (lamda && v>=3) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu);
 604
 605   if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
 606   if (v && i>/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
 607   if (v && !i)  cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
 608   if (v>=2) cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
 609   L = i;
 610
 611   // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
 612   for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a];
 613   Neff_M[L+1]=1.0f;
 614   Neff_I[L+1]=Neff_D[L+1]=0.0f;
 615
 616   return 1; //return status: ok
 617
 618 } /* this is the end of HMM::Read() */
 619
 620
 621 /////////////////////////////////////////////////////////////////////////////////////
 622 /**
 623  * @brief Read an HMM from a HMMer .hmm file; return 0 at end of file
 624  */
 625 int
 626 HMM::ReadHMMer(FILE* dbf, char* filestr)
 627 {
 628   char line[LINELEN]="";    // input line
 629   char desc[DESCLEN]="";    // description of family
 630   char str4[5]="";          // first 4 letters of input line
 631   char* ptr;                // pointer for string manipulation
 632   int i=0;                  // index for match state (first=1)
 633   int a;                    // amino acid index
 634   char dssp=0;              // 1 if a consensus SS has been found in the transition prob lines
 635   char annot=0;             // 1 if at least one annotation character in insert lines is ne '-' or ' '
 636   int k=0;                  // index for seq[k]
 637   static char ignore_hmmer_cal = 0;
 638   char* annotchr;           // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
 639   //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
 640   annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
 641   static int warn=0;
 642   int iAlpha = 20; /* size of alphabet, default is protein = 20 */
 643   double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */
 644
 645   trans_lin=0;
 646   L=0;
 647   Neff_HMM=0;
 648   n_display=N_in=N_filtered=0;
 649   nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
 650   lamda=mu=0.0;
 651   trans_lin=0; // transition probs in log space
 652   name[0]=longname[0]=desc[0]=fam[0]='\0';
 653   //If at the end of while-loop L is still 0 then we have reached end of db file
 654
 655   // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
 656
 657   while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
 658     {
 659
 660       if (strscn(line)==NULL) continue;   // skip lines that contain only white space
 661       if (!strncmp("HMMER",line,5)) continue;
 662
 663       substr(str4,line,0,3);              // copy the first four characters into str4
 664
 665       if (!strcmp("NAME",str4) && name[0]=='\0')
 666         {
 667           ptr=strscn(line+4);             // advance to first non-white-space character
 668           strncpy(name,ptr,NAMELEN-1);    // copy full name to name
 669           strcut(name);                   // ...cut after first word...
 670           if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
 671         }
 672
 673       else if (!strcmp("ACC ",str4))
 674         {
 675           ptr=strscn(line+4);              // advance to first non-white-space character
 676           strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname...
 677         }
 678
 679       else if (!strcmp("DESC",str4))
 680         {
 681           ptr=strscn(line+4);             // advance to first non-white-space character
 682           if (ptr)
 683             {
 684               strncpy(desc,ptr,DESCLEN-1);   // copy description to name...
 685               desc[DESCLEN-1]='\0';
 686               strcut(ptr);                   // ...cut after first word...
 687             }
 688           if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name?
 689         }
 690
 691       else if (!strcmp("LENG",str4))
 692         {
 693           ptr=line+4;
 694           L=strint(ptr);                  //read next integer (number of match states)
 695         }
 696
 697       else if (!strcmp("ALPH",str4)) {
 698
 699           ptr=strscn(line+4);
 700
 701           if (0 == strcmp(ptr, "Amino")){
 702               iAlpha = 20;
 703           }
 704           else if (0 == strcmp(ptr, "Nucleic")){
 705               iAlpha = 4;
 706               printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n",
 707                      __FUNCTION__, __FILE__, __LINE__);
 708           }
 709           else {
 710               return Warning(dbf,line,name);
 711           }
 712           dAlphaInv = 1.00 / (double)(iAlpha);
 713           //continue;
 714       }
 715       else if (!strcmp("RF  ",str4)) continue;
 716       else if (!strcmp("CS  ",str4)) continue;
 717       else if (!strcmp("MAP ",str4)) continue;
 718       else if (!strcmp("COM ",str4)) continue;
 719       else if (!strcmp("NSEQ",str4))
 720         {
 721           ptr=line+4;
 722           N_in=N_filtered=strint(ptr);    //read next integer: number of sequences after filtering
 723         }
 724
 725       else if (!strcmp("DATE",str4)) continue;
 726       else if (!strncmp("CKSUM ",line,5)) continue;
 727       else if (!strcmp("GA  ",str4)) continue;
 728       else if (!strcmp("TC  ",str4)) continue;
 729       else if (!strcmp("NC  ",str4)) continue;
 730
 731       else if (!strncmp("SADSS",line,5))
 732         {
 733           if (nsa_dssp<0)
 734             {
 735               nsa_dssp=k++;
 736               seq[nsa_dssp] = new char[/*MAXRES*/par.maxResLen+2];
 737               sname[nsa_dssp] = new(char[15]);
 738               strcpy(seq[nsa_dssp]," ");
 739               strcpy(sname[nsa_dssp],"sa_dssp");
 740
 741             }
 742           ptr=strscn(line+5);
 743           if (ptr)
 744             {
 745               strcut(ptr);
 746               if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
 747                 printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
 748               else strcat(seq[nsa_dssp],ptr);
 749             }
 750         }
 751
 752       else if (!strncmp("SSPRD",line,5))
 753         {
 754           if (nss_pred<0)
 755             {
 756               nss_pred=k++;
 757               seq[nss_pred] = new char[/*MAXRES*/par.maxResLen+2];
 758               sname[nss_pred] = new(char[15]);
 759               strcpy(seq[nss_pred]," ");
 760               strcpy(sname[nss_pred],"ss_pred");
 761
 762             }
 763           ptr=strscn(line+5);
 764           if (ptr)
 765             {
 766               strcut(ptr);
 767               if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
 768                 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
 769               else strcat(seq[nss_pred],ptr);
 770             }
 771         }
 772
 773       else if (!strncmp("SSCON",line,5))
 774         {
 775           if (nss_conf<0)
 776             {
 777               nss_conf=k++;
 778               seq[nss_conf] = new char[/*MAXRES*/par.maxResLen+2];
 779               sname[nss_conf] = new(char[15]);
 780               strcpy(seq[nss_conf]," ");
 781               strcpy(sname[nss_conf],"ss_conf");
 782             }
 783           ptr=strscn(line+5);
 784           if (ptr)
 785             {
 786               strcut(ptr);
 787               if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
 788                 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
 789               else strcat(seq[nss_conf],ptr);
 790             }
 791         }
 792
 793       else if (!strncmp("SSCIT",line,5)) continue;
 794       else if (!strcmp("XT  ",str4)) continue;
 795       else if (!strcmp("NULT",str4)) continue;
 796
 797       else if (!strcmp("NULE",str4))
 798         {
 799           ptr=line+4;
 800           for (a=0; (a < iAlpha) && ptr; ++a){
 801             /* FIXME: FS introduced alphabet size (was '20')
 802                and dAlphaInv (was '0.05' = 1/20) */
 803             //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
 804             pb[s2a[a]] = (float) dAlphaInv * fpow2(float(strinta(ptr,-99999))/HMMSCALE);  /* dAlphaInv */
 805           }
 806           for (a = iAlpha; a < 20; a++){
 807             pb[s2a[a]] = 0.0;
 808           }
 809           if (!ptr) return Warning(dbf,line,name);
 810           if (v>=4)
 811             {
 812               printf("\nNULL  ");
 813               for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
 814                 printf("%5.1f ",100.*pb[s2a[a]]);
 815               }
 816               printf("\n");
 817             }
 818         }
 819
 820       else if (!strcmp("EVD ",str4))
 821         {
 822           char* ptr=line+4;
 823           ptr = strscn(ptr);
 824           sscanf(ptr,"%f",&lamda);
 825           ptr = strscn(ptr);
 826           sscanf(ptr,"%f",&mu);
 827           if (lamda<0)
 828             {
 829               if (v>=2 && ignore_hmmer_cal==0)
 830                 cerr<<endl<<"Warning: some HMMs have been calibrated with HMMER's 'hmmcalibrate'. These calibrations will be ignored\n";
 831               ignore_hmmer_cal=1;
 832               mu = lamda = 0.0;
 833             }
 834         }
 835
 836       /////////////////////////////////////////////////////////////////////////////////////
 837       // Read transition probabilities from start state
 838       else if (!strncmp("HMM",line,3))
 839         {
 840           fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
 841           fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
 842           ptr=line;
 843           for (a=0; a<=M2D && ptr; ++a)
 844             tr[0][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition probabilites as log2 values
 845             // strinta returns next integer in string and puts ptr to first char
 846             // after the integer. Returns -99999 if '*' is found.
 847             // ptr is set to 0 if no integer is found after ptr.
 848           tr[0][I2M] = tr[0][D2M] = 0.0;
 849           tr[0][I2I] = tr[0][D2D] = -99999.0;
 850           if (!ptr) return Warning(dbf,line,name);
 851           if (v>=4)
 852             {
 853               printf("       ");
 854               for (a=0; a<=D2D && ptr; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
 855               printf("\n");
 856             }
 857
 858           // Prepare to store DSSP states (if there are none, delete afterwards)
 859           nss_dssp=k++;
 860           seq[nss_dssp] = new char[/*MAXRES*/par.maxResLen+2];
 861           sname[nss_dssp] = new(char[15]);
 862           strcpy(sname[nss_dssp],"ss_dssp");
 863
 864           /////////////////////////////////////////////////////////////////////////////////////
 865           // Read columns of HMM
 866           int next_i=0;  // index of next column
 867           while (fgetline(line,LINELEN-1,dbf) &&  !(line[0]=='/' && line[1]=='/') && line[0]!='#')
 868             {
 869               if (strscn(line)==NULL) continue; // skip lines that contain only white space
 870
 871               // Read in AA probabilities
 872               ptr=line;
 873               int prev_i = next_i;
 874               next_i = strint(ptr); ++i;
 875               if (v && next_i!=prev_i+1)
 876                 if (++warn<5)
 877                   {
 878                     cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
 879                     if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
 880                   }
 881               if (i>L)
 882                 {
 883                   cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
 884                   return 2;
 885                 }
 886               if (i>L && v)
 887                 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
 888               if (i>=/*MAXRES*/par.maxResLen-2)
 889                 {
 890                   fgetline(line,LINELEN-1,dbf); // Skip two lines
 891                   fgetline(line,LINELEN-1,dbf);
 892                   continue;
 893                 }
 894
 895               for (a=0; (a<iAlpha) && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */
 896               f[i][s2a[a]] = (float) pb[s2a[a]]*fpow2(float(strinta(ptr,-99999))/HMMSCALE);
 897               //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
 898               }
 899               for (a = iAlpha; a < 20; a++){
 900               f[i][s2a[a]] = 0.0;
 901               }
 902               if (!ptr) return Warning(dbf,line,name);
 903               if (v>=4)
 904               {
 905                   printf("%6i ",i);
 906                   for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
 907                       printf("%5.1f ",100*f[i][s2a[a]]);
 908                   }
 909                   printf("\n");
 910               }
 911
 912               // Read insert emission line
 913               fgetline(line,LINELEN-1,dbf);
 914               ptr = strscn(line);
 915               if (!ptr) return Warning(dbf,line,name);
 916               annotchr[i]=uprchr(*ptr);
 917               if (*ptr!='-' && *ptr!=' ') annot=1;
 918
 919               // Read annotation character and seven transition probabilities
 920               fgetline(line,LINELEN-1,dbf);
 921               ptr = strscn(line);
 922               switch (*ptr)
 923                 {
 924                 case 'H':
 925                   ss_dssp[i]=1;
 926                   seq[nss_dssp][i]=*ptr;
 927                   dssp=1;
 928                   break;
 929                 case 'E':
 930                   ss_dssp[i]=2;
 931                   seq[nss_dssp][i]=*ptr;
 932                   dssp=1;
 933                   break;
 934                 case 'C':
 935                   ss_dssp[i]=3;
 936                   seq[nss_dssp][i]=*ptr;
 937                   dssp=1;
 938                   break;
 939                 case 'S':
 940                   ss_dssp[i]=4;
 941                   seq[nss_dssp][i]=*ptr;
 942                   dssp=1;
 943                   break;
 944                 case 'T':
 945                   ss_dssp[i]=5;
 946                   seq[nss_dssp][i]=*ptr;
 947                   dssp=1;
 948                   break;
 949                 case 'G':
 950                   ss_dssp[i]=6;
 951                   seq[nss_dssp][i]=*ptr;
 952                   dssp=1;
 953                   break;
 954                 case 'B':
 955                   ss_dssp[i]=7;
 956                   seq[nss_dssp][i]=*ptr;
 957                   dssp=1;
 958                   break;
 959                 case 'I':
 960                   dssp=1;
 961                 case '~':
 962                   ss_dssp[i]=3;
 963                   seq[nss_dssp][i]=*ptr;
 964                   break;
 965                 case '-':
 966                 default:
 967                   ss_dssp[i]=0;
 968                   seq[nss_dssp][i]=*ptr;
 969                   break;
 970
 971                 }
 972
 973               ptr+=2;
 974               for (a=0; a<=D2D && ptr; ++a)
 975                 tr[i][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition prob's as log2-values
 976               if (!ptr) return Warning(dbf,line,name);
 977               if (v>=4)
 978                 {
 979                   printf("       ");
 980                   for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
 981                   printf("\n");
 982                 }
 983             }
 984
 985           if (line[0]=='/' && line[1]=='/') break;
 986
 987         } /* strncmp("HMM") */
 988
 989     } //while(getline)
 990
 991   if (L==0) return 0; //End of db file -> stop reading in
 992
 993   // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
 994   //   lamda = lamda_hash.Show(par.Key());
 995   //   mu    = mu_hash.Show(par.Key());
 996   if (lamda && v>=2) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu);
 997
 998   if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
 999   if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
1000   if (v && !i)  cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
1001   L = i;
1002
1003   if (strlen(longname)>0) strcat(longname," ");
1004   strncat(longname,name,DESCLEN-strlen(longname)-1);  // longname = ACC NAME DESC
1005   if (strlen(name)>0) strcat(longname," ");
1006   strncat(longname,desc,DESCLEN-strlen(longname)-1);
1007   longname[DESCLEN-1]='\0';
1008   ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4)
1009   RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file'
1010
1011   // Secondary structure
1012   if (!dssp)
1013     {
1014       // remove dssp sequence
1015       // memory that had been allocated in case ss_dssp was given needs to be freed
1016       delete[] seq[nss_dssp];    seq[nss_dssp] = NULL;
1017       // memory that had been allocated in case ss_dssp was given needs to be freed
1018       delete[] sname[nss_dssp];    sname[nss_dssp] = NULL;
1019       nss_dssp=-1;
1020       k--;
1021     }
1022   if (nss_pred>=0)
1023     {
1024       for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]);
1025       if (nss_conf>=0)
1026         for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]);
1027       else
1028         for (i=1; i<=L; ++i) ss_conf[i] = 5;
1029     }
1030
1031   // Copy query (first sequence) and consensus  residues?
1032   if (par.showcons)
1033     {
1034       sname[k]=new(char[10]);
1035       strcpy(sname[k],"Consensus");
1036       sname[k+1]=new char[strlen(longname)+1];
1037       strcpy(sname[k+1],longname);
1038       seq[k]=new char[L+2];
1039       seq[k][0]=' ';
1040       seq[k][L+1]='\0';
1041       seq[k+1]=new char[L+2];
1042       seq[k+1][0]=' ';
1043       seq[k+1][L+1]='\0';
1044       for (i=1; i<=L; ++i)
1045         {
1046           float pmax=0.0;
1047           int amax=0;
1048           for (a=0; a<NAA; ++a)
1049             if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1050           if (pmax>0.6) seq[k][i]=i2aa(amax);
1051           else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax));
1052           else seq[k][i]='x';
1053           seq[k+1][i]=i2aa(amax);
1054         }
1055       ncons=k++; // nfirst is set later!
1056     }
1057   else
1058     {
1059       sname[k]=new char[strlen(longname)+1];
1060       /* FIXME valgrind says bytes get lost here during hmm iteration --
1061          fixed in HMM::ClobberGlobal(), I (FS) think */
1062       strcpy(sname[k],longname);
1063       seq[k]=new char[L+2];
1064       seq[k][0]=' ';
1065       seq[k][L+1]='\0';
1066     }
1067
1068   if (annot) // read in some annotation characters?
1069     {
1070       annotchr[0]=' ';
1071       annotchr[L+1]='\0';
1072       strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters
1073     }
1074   else if (!par.showcons)  // we have not yet calculated the consensus, but we need it now as query (first sequence)
1075     {
1076       /* FIXME: FS set ncons=k
1077          don't understand why it is not set but seem to need it */
1078       ncons = k;
1079       for (i=1; i<=L; ++i)
1080         {
1081           float pmax=0.0;
1082           int amax=0;
1083           for (a=0; a<NAA; ++a)
1084             if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1085           seq[k][i]=i2aa(amax);
1086         }
1087     }
1088 //   printf("%i query name=%s  seq=%s\n",n,sname[n],seq[n]);
1089   nfirst=k++;
1090
1091   n_display=k;
1092
1093   // Calculate overall Neff_HMM
1094   Neff_HMM=0;
1095   for (i=1; i<=L; ++i)
1096     {
1097       float S=0.0;
1098       for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
1099         if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]);
1100       }
1101       Neff_HMM+=(float) fpow2(S);
1102     }
1103   Neff_HMM/=L;
1104   for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1105   if (v>=2)
1106     cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
1107
1108   // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1109   for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
1110     f[0][a]=f[L+1][a]=pb[a];
1111   }
1112   delete[] annotchr;    annotchr = NULL;
1113
1114   return 1; //return status: ok
1115
1116 } /* this is the end of HMM::ReadHMMer() */
1117
1118
1119
1120 /////////////////////////////////////////////////////////////////////////////////////
1121 /**
1122  * @brief Read an HMM from a HMMER3 .hmm file; return 0 at end of file
1123  */
1124 int
1125 HMM::ReadHMMer3(FILE* dbf, char* filestr)
1126 {
1127     char line[LINELEN]="";    // input line
1128     char desc[DESCLEN]="";    // description of family
1129     char str4[5]="";          // first 4 letters of input line
1130     char* ptr;                // pointer for string manipulation
1131     int i=0;                  // index for match state (first=1)
1132     int a;                    // amino acid index
1133     char dssp=0;              // 1 if a consensus SS has been found in the transition prob lines
1134     char annot=0;             // 1 if at least one annotation character in insert lines is ne '-' or ' '
1135     int k=0;                  // index for seq[k]
1136     char* annotchr;           // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1137     //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1138     annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1139     static int warn=0;
1140     int iAlpha = 20; /* size of alphabet, default is protein = 20 */
1141     double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */
1142
1143  trans_lin=0;
1144  L=0;
1145  Neff_HMM=0;
1146  n_seqs=n_display=N_in=N_filtered=0;
1147  nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
1148  lamda=mu=0.0;
1149  trans_lin=0; // transition probs in log space
1150  name[0]=longname[0]=desc[0]=fam[0]='\0';
1151  //If at the end of while-loop L is still 0 then we have reached end of db file
1152
1153  // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
1154
1155
1156  while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
1157      {
1158
1159          if (strscn(line)==NULL) continue;   // skip lines that contain only white space
1160          if (!strncmp("HMMER",line,5)) continue;
1161
1162          substr(str4,line,0,3);              // copy the first four characters into str4
1163
1164          if (!strcmp("NAME",str4) && name[0]=='\0')
1165              {
1166                  ptr=strscn(line+4);             // advance to first non-white-space character
1167                  strncpy(name,ptr,NAMELEN-1);    // copy full name to name
1168                  strcut(name);                   // ...cut after first word...
1169                  if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
1170              }
1171
1172          else if (!strcmp("ACC ",str4))
1173              {
1174                  ptr=strscn(line+4);              // advance to first non-white-space character
1175                  strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname...
1176              }
1177
1178          else if (!strcmp("DESC",str4))
1179              {
1180                  ptr=strscn(line+4);             // advance to first non-white-space character
1181                  if (ptr)
1182                      {
1183                          strncpy(desc,ptr,DESCLEN-1);   // copy description to name...
1184                          desc[DESCLEN-1]='\0';
1185                          strcut(ptr);                   // ...cut after first word...
1186                      }
1187                  if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name?
1188              }
1189
1190          else if (!strcmp("LENG",str4))
1191              {
1192                  ptr=line+4;
1193                  L=strint(ptr);                  //read next integer (number of match states)
1194              }
1195
1196          else if (!strcmp("ALPH",str4)) {
1197
1198              ptr=strscn(line+4);
1199
1200              if (0 == strcmp(ptr, "amino")){
1201                  iAlpha = 20;
1202              }
1203              else if (0 == strcmp(ptr, "Nucleic")){
1204                  iAlpha = 4;
1205                  printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n",
1206                         __FUNCTION__, __FILE__, __LINE__);
1207              }
1208              else {
1209                  return Warning(dbf,line,name);
1210              }
1211              dAlphaInv = 1.00 / (double)(iAlpha);
1212              //continue;
1213          }
1214          else if (!strcmp("RF  ",str4)) continue;
1215          else if (!strcmp("CS  ",str4)) continue;
1216          else if (!strcmp("MAP ",str4)) continue;
1217          else if (!strcmp("COM ",str4)) continue;
1218          else if (!strcmp("NSEQ",str4))
1219              {
1220                  ptr=line+4;
1221                  N_in=N_filtered=strint(ptr);    //read next integer: number of sequences after filtering
1222              }
1223
1224          else if (!strcmp("DATE",str4)) continue;
1225          else if (!strncmp("CKSUM ",line,5)) continue;
1226          else if (!strcmp("GA  ",str4)) continue;
1227          else if (!strcmp("TC  ",str4)) continue;
1228          else if (!strcmp("NC  ",str4)) continue;
1229
1230          //////////////////////////////////////////////////////////////////////////////////////////////////////
1231          // Still needed???
1232
1233          else if (!strncmp("SADSS",line,5))
1234              {
1235                  if (nsa_dssp<0)
1236                      {
1237                          nsa_dssp=k++;
1238                          seq[nsa_dssp] = new char[/*MAXRES*/par.maxResLen+2];
1239                          sname[nsa_dssp] = new(char[15]);
1240                          strcpy(seq[nsa_dssp]," ");
1241                          strcpy(sname[nsa_dssp],"sa_dssp");
1242
1243                      }
1244                  ptr=strscn(line+5);
1245                  if (ptr)
1246                      {
1247                          strcut(ptr);
1248                          if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
1249                              printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
1250                          else strcat(seq[nsa_dssp],ptr);
1251                      }
1252              }
1253
1254          else if (!strncmp("SSPRD",line,5))
1255              {
1256                  if (nss_pred<0)
1257                      {
1258                          nss_pred=k++;
1259                          seq[nss_pred] = new char[/*MAXRES*/par.maxResLen+2];
1260                          sname[nss_pred] = new(char[15]);
1261                          strcpy(seq[nss_pred]," ");
1262                          strcpy(sname[nss_pred],"ss_pred");
1263
1264                      }
1265                  ptr=strscn(line+5);
1266                  if (ptr)
1267                      {
1268                          strcut(ptr);
1269                          if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
1270                              printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
1271                          else strcat(seq[nss_pred],ptr);
1272                      }
1273              }
1274
1275          else if (!strncmp("SSCON",line,5))
1276              {
1277                  if (nss_conf<0)
1278                      {
1279                          nss_conf=k++;
1280                          seq[nss_conf] = new char[/*MAXRES*/par.maxResLen+2];
1281                          sname[nss_conf] = new(char[15]);
1282                          strcpy(seq[nss_conf]," ");
1283                          strcpy(sname[nss_conf],"ss_conf");
1284                      }
1285                  ptr=strscn(line+5);
1286                  if (ptr)
1287                      {
1288                          strcut(ptr);
1289                          if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
1290                              printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
1291                          else strcat(seq[nss_conf],ptr);
1292                      }
1293              }
1294
1295          else if (!strncmp("SSCIT",line,5)) continue;
1296          else if (!strcmp("XT  ",str4)) continue;
1297          //////////////////////////////////////////////////////////////////////////////////////////////////////
1298          else if (!strncmp("STATS LOCAL",line,11)) continue;
1299
1300          else if (!strcmp("EFFN",str4))
1301              {
1302                  ptr=line+4;
1303                  float effn = strflt(ptr);
1304                  // Calculate Neff_HMM by using f(x) = ax^0.1 + bx^0.5 + cx + d  (fitted with scop25 dataset)
1305                  Neff_HMM = -1.403534 * pow(effn, 0.1) + 4.428118 * pow(effn, 0.5) - 0.2885410 * effn - 1.108568;
1306              }
1307
1308          /////////////////////////////////////////////////////////////////////////////////////
1309          // Read transition probabilities from start state
1310          else if (!strncmp("HMM",line,3))
1311              {
1312                  fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
1313                  fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
1314                  ptr=strscn(line);
1315
1316                  if (!strncmp("COMPO",ptr,5))
1317                      {
1318                          ptr=ptr+5;
1319                          for (a=0; a<20 && ptr; ++a)
1320                              //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1321                              pb[s2a[a]] = (float) exp(-1.0*strflta(ptr,99999));
1322                          if (!ptr) return Warning(dbf,line,name);
1323                          if (v>=4)
1324                              {
1325                                  printf("\nNULL ");
1326                                  for (a=0; a<20; ++a) printf("%6.3g ",100.*pb[s2a[a]]);
1327                                  printf("\n");
1328                              }
1329                          fgetline(line,LINELEN-1,dbf); // Read next line
1330                      }
1331
1332                  fgetline(line,LINELEN-1,dbf); // Skip line with 0-states insert probabilities
1333
1334                  ptr = strscn(line);
1335                  for (a=0; a<=D2D && ptr; ++a)
1336                      tr[0][a] = Log2((float) exp(-1.0*strflta(ptr,99999))); //store transition probabilites as log2 values
1337                  // strinta returns next integer in string and puts ptr to first char
1338                  // after the integer. Returns -99999 if '*' is found.
1339                  // ptr is set to 0 if no integer is found after ptr.
1340                  if (!ptr) return Warning(dbf,line,name);
1341                  if (v>=4)
1342                      {
1343                          printf("       ");
1344                          for (a=0; a<=D2D && ptr; ++a) printf("%6.3g ",100*fpow2(tr[i][a]));
1345                          printf("\n");
1346                      }
1347
1348                  // Prepare to store DSSP states (if there are none, delete afterwards)
1349                  nss_dssp=k++;
1350                  seq[nss_dssp] = new char[/*MAXRES*/par.maxResLen+2];
1351                  sname[nss_dssp] = new(char[15]);
1352                  strcpy(sname[nss_dssp],"ss_dssp");
1353
1354                  /////////////////////////////////////////////////////////////////////////////////////
1355                  // Read columns of HMM
1356                  int next_i=0;  // index of next column
1357                  while (fgetline(line,LINELEN-1,dbf) &&  !(line[0]=='/' && line[1]=='/') && line[0]!='#')
1358                      {
1359                          if (strscn(line)==NULL) continue; // skip lines that contain only white space
1360
1361                          // Read in AA probabilities
1362                          ptr=line;
1363                          int prev_i = next_i;
1364                          next_i = strint(ptr); ++i;
1365                          if (v && next_i!=prev_i+1)
1366                              if (++warn<5)
1367                                  {
1368                                      cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
1369                                      if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
1370                                  }
1371                          if (i>L)
1372                              {
1373                                  cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
1374                                  return 2;
1375                              }
1376                          if (i>L && v)
1377                              cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
1378                          if (i>=/*MAXRES*/par.maxResLen-2)
1379                              {
1380                                  fgetline(line,LINELEN-1,dbf); // Skip two lines
1381                                  fgetline(line,LINELEN-1,dbf);
1382                                  continue;
1383                              }
1384
1385                          for (a=0; a<iAlpha && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */
1386                              f[i][s2a[a]] = (float) exp(-1.0*strflta(ptr,99999));
1387                              //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1388                          }
1389                          for (a = iAlpha; a < 20; a++){
1390                              f[i][s2a[a]] = 0.0;
1391                          }
1392                          if (!ptr) return Warning(dbf,line,name);
1393                          if (v>=4)
1394                              {
1395                                  printf("%6i ",i);
1396                                  for (a=0; a<iAlpha; ++a) printf("%6.3g ",100*f[i][s2a[a]]);
1397                                  printf("\n");
1398                              }
1399
1400                          // Ignore MAP annotation
1401                          ptr = strscn(line); //find next word
1402                          ptr = strscn_ws(line); // ignore word
1403
1404                          // Read RF and CS annotation
1405                          ptr = strscn(line);
1406                          if (!ptr) return Warning(dbf,line,name);
1407                          annotchr[i]=uprchr(*ptr);
1408                          if (*ptr!='-' && *ptr!=' ') annot=1;
1409
1410                          ptr = strscn(line);
1411                          switch (*ptr)
1412                              {
1413                              case 'H':
1414                                  ss_dssp[i]=1;
1415                                  seq[nss_dssp][i]=*ptr;
1416                                  dssp=1;
1417                                  break;
1418                              case 'E':
1419                                  ss_dssp[i]=2;
1420                                  seq[nss_dssp][i]=*ptr;
1421                                  dssp=1;
1422                                  break;
1423                              case 'C':
1424                                  ss_dssp[i]=3;
1425                                  seq[nss_dssp][i]=*ptr;
1426                                  dssp=1;
1427                                  break;
1428                              case 'S':
1429                                  ss_dssp[i]=4;
1430                                  seq[nss_dssp][i]=*ptr;
1431                                  dssp=1;
1432                                  break;
1433                              case 'T':
1434                                  ss_dssp[i]=5;
1435                                  seq[nss_dssp][i]=*ptr;
1436                                  dssp=1;
1437                                  break;
1438                              case 'G':
1439                                  ss_dssp[i]=6;
1440                                  seq[nss_dssp][i]=*ptr;
1441                                  dssp=1;
1442                                  break;
1443                              case 'B':
1444                                  ss_dssp[i]=7;
1445                                  seq[nss_dssp][i]=*ptr;
1446                                  dssp=1;
1447                                  break;
1448                              case 'I':
1449                                  dssp=1;
1450                              case '~':
1451                                  ss_dssp[i]=3;
1452                                  seq[nss_dssp][i]=*ptr;
1453                                  break;
1454                              case '-': // no SS available from any template
1455                              case '.': // no clear consensus SS structure
1456                              case 'X': // no clear consensus SS structure
1457                                  ss_dssp[i]=0;
1458                                  seq[nss_dssp][i]='-';
1459                                  break;
1460                              default:
1461                                  ss_dssp[i]=0;
1462                                  seq[nss_dssp][i]=*ptr;
1463                                  break;
1464                              }
1465
1466                          // Read insert emission line
1467                          fgetline(line,LINELEN-1,dbf);
1468
1469                          // Read seven transition probabilities
1470                          fgetline(line,LINELEN-1,dbf);
1471
1472                          ptr+=2;
1473                          for (a=0; a<=D2D && ptr; ++a)
1474                              tr[i][a] = Log2((float) exp(-1.0*strflta(ptr,99999))); //store transition prob's as log2-values
1475                          if (!ptr) return Warning(dbf,line,name);
1476                          if (v>=4)
1477                              {
1478                                  printf("       ");
1479                                  for (a=0; a<=D2D; ++a) printf("%6.3g ",100*fpow2(tr[i][a]));
1480                                  printf("\n");
1481                              }
1482                      }
1483
1484                  if (line[0]=='/' && line[1]=='/') break;
1485
1486              } /* strncmp("HMM") */
1487
1488      } //while(getline)
1489
1490  if (L==0) return 0; //End of db file -> stop reading in
1491
1492  if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
1493  if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
1494  if (v && !i)  cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
1495  L = i;
1496
1497  if (strlen(longname)>0) strcat(longname," ");
1498  strncat(longname,name,DESCLEN-strlen(longname)-1);  // longname = ACC NAME DESC
1499  if (strlen(name)>0) strcat(longname," ");
1500  strncat(longname,desc,DESCLEN-strlen(longname)-1);
1501  longname[DESCLEN-1]='\0';
1502  ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4)
1503  RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file'
1504
1505  // Secondary structure
1506  if (!dssp)
1507      {
1508          // remove dssp sequence
1509          delete[] seq[nss_dssp];    // memory that had been allocated in case ss_dssp was given needs to be freed
1510          delete[] sname[nss_dssp];  // memory that had been allocated in case ss_dssp was given needs to be freed
1511          nss_dssp=-1;
1512          k--;
1513      }
1514  else { seq[nss_dssp][0]='-'; seq[nss_dssp][L+1]='\0'; }
1515
1516  if (nss_pred>=0)
1517      {
1518          for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]);
1519          if (nss_conf>=0)
1520              for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]);
1521          else
1522              for (i=1; i<=L; ++i) ss_conf[i] = 5;
1523      }
1524
1525  // Copy query (first sequence) and consensus  residues?
1526  if (par.showcons)
1527      {
1528          sname[k]=new(char[10]);
1529          strcpy(sname[k],"Consensus");
1530          sname[k+1]=new char[strlen(longname)+1];
1531          strcpy(sname[k+1],longname);
1532          seq[k]=new char[L+2];
1533          seq[k][0]=' ';
1534          seq[k][L+1]='\0';
1535          seq[k+1]=new char[L+2];
1536          seq[k+1][0]=' ';
1537          seq[k+1][L+1]='\0';
1538          for (i=1; i<=L; ++i)
1539              {
1540                  float pmax=0.0;
1541                  int amax=0;
1542                  for (a=0; a<NAA; ++a)
1543                      if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1544                  if (pmax>0.6) seq[k][i]=i2aa(amax);
1545                  else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax));
1546                  else seq[k][i]='x';
1547                  seq[k+1][i]=i2aa(amax);
1548              }
1549          ncons=k++; // nfirst is set later!
1550      }
1551  else
1552      {
1553          sname[k]=new char[strlen(longname)+1];
1554          strcpy(sname[k],longname);
1555          seq[k]=new char[L+2];
1556          seq[k][0]=' ';
1557          seq[k][L+1]='\0';
1558      }
1559
1560  if (annot) // read in some annotation characters?
1561      {
1562          annotchr[0]=' ';
1563          annotchr[L+1]='\0';
1564          strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters
1565      }
1566  else if (!par.showcons)  // we have not yet calculated the consensus, but we need it now as query (first sequence)
1567      {
1568          for (i=1; i<=L; ++i)
1569              {
1570                  float pmax=0.0;
1571                  int amax=0;
1572                  for (a=0; a<NAA; ++a)
1573                      if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1574                  seq[k][i]=i2aa(amax);
1575              }
1576      }
1577  //   printf("%i query name=%s  seq=%s\n",n,sname[n],seq[n]);
1578  nfirst=k++;
1579
1580  n_display=k;
1581  n_seqs=k;
1582
1583  // If no effektive number of sequences is given, calculate Neff_HMM by given profile
1584  if (Neff_HMM == 0) {
1585      for (i=1; i<=L; ++i)
1586          {
1587              float S=0.0;
1588              for (a=0; a<20; ++a)
1589                  if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]);
1590              Neff_HMM+=(float) fpow2(S);
1591          }
1592      Neff_HMM/=L;
1593  }
1594
1595  for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1596  Neff_M[L+1]=1.0f;
1597  Neff_I[L+1]=Neff_D[L+1]=0.0f;
1598
1599  if (v>=2)
1600      cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
1601
1602  ///////////////////////////////////////////////////////////////////
1603
1604  // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1605  for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a];
1606  delete[] annotchr;
1607
1608  has_pseudocounts=true;
1609
1610  return 1; //return status: ok
1611
1612
1613 } /* this is the end of HMM::ReadHMMer3() */
1614
1615
1616 //////////////////////////////////////////////////////////////////////////////
1617 /**
1618  * @brief Add transition pseudocounts to HMM (and calculate lin-space transition probs)
1619  */
1620 void
1621 HMM::AddTransitionPseudocounts(float gapd, float gape, float gapf, float gapg, float gaph, float gapi, float gapb)
1622 {
1623   int i;               //position in alignment
1624   float sum;
1625   float pM2M, pM2I, pM2D, pI2I, pI2M, pD2D, pD2M;
1626   float p0,p1,p2;
1627   if (par.gapb<=0) return;
1628   if (trans_lin==1) {fprintf(stderr,"Error: Adding transition pseudocounts to linear representation of %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);}
1629   if (trans_lin==2) {fprintf(stderr,"Error: Adding transition pseudocounts twice is %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);}
1630   trans_lin=2;
1631
1632   // Calculate pseudocount transition probabilities
1633   pM2D=pM2I=gapd*0.0286;     //a-priori probability for inserts and deletions
1634   pM2M=1-pM2D-pM2I;
1635   // gape=0 -> pI2I=0   gape=1 -> pI2I=0.75    gape=inf -> pI2I=1.
1636   pI2I=1.0*gape/(gape-1+1.0/0.75);
1637   pI2M=1-pI2I;
1638   // gape=0 -> pD2D=0   gape=1 -> pD2D=0.75    gape=inf -> pD2D=1.
1639   pD2D=1.0*gape/(gape-1+1.0/0.75);
1640   pD2M=1-pD2D;
1641
1642   for (i=0; i<=L; ++i) //for all columns in HMM
1643     {
1644       // Transitions from M state
1645       p0 = (Neff_M[i]-1)*fpow2(tr[i][M2M]) + gapb*pM2M;
1646       p1 = (Neff_M[i]-1)*fpow2(tr[i][M2D]) + gapb*pM2D;
1647       p2 = (Neff_M[i]-1)*fpow2(tr[i][M2I]) + gapb*pM2I;
1648       if (i==0) p1=p2=0;       //from M(0) no transition to D(1) and I(0) possible
1649       if (i==L) p1=p2=0;       //from M(L) no transition to D(L+1) and I(L+1) possible
1650       sum = p0+p1+p2+FLT_MIN;
1651
1652 //       p0 = p0/sum ;
1653 //       p1 = pow(p1/sum,gapf);
1654 //       p2 = pow(p2/sum,gapg);
1655 //       sum = p0+p1+p2+FLT_MIN;
1656 //       tr[i][M2M] = fast_log2(p0/sum);
1657 //       tr[i][M2D] = fast_log2(p1/sum);
1658 //       tr[i][M2I] = fast_log2(p2/sum);
1659
1660       tr[i][M2M] = fast_log2(p0/sum);
1661       tr[i][M2D] = fast_log2(p1/sum)*gapf;
1662       tr[i][M2I] = fast_log2(p2/sum)*gapg;
1663
1664       // Transitions from I state
1665       p0 = Neff_I[i]*fpow2(tr[i][I2M]) + gapb*pI2M;
1666       p1 = Neff_I[i]*fpow2(tr[i][I2I]) + gapb*pI2I;
1667       sum = p0+p1+FLT_MIN;
1668
1669 //       p0 = pow(p0/sum,gapg);
1670 //       p1 = pow(p1/sum,gapi);
1671 //       sum = p0+p1+FLT_MIN;
1672 //       tr[i][I2M] = fast_log2(p0/sum);
1673 //       tr[i][I2I] = fast_log2(p1/sum);
1674
1675       tr[i][I2M] = fast_log2(p0/sum);
1676       tr[i][I2I] = fast_log2(p1/sum)*gapi;
1677
1678       // Transitions from D state
1679       p0 = Neff_D[i]*fpow2(tr[i][D2M]) + gapb*pD2M;
1680       p1 = Neff_D[i]*fpow2(tr[i][D2D]) + gapb*pD2D;
1681       if (i==L) p1=0;          //from D(L) no transition to D(L+1) possible
1682       sum = p0+p1+FLT_MIN;
1683
1684 //       p0 = pow(p0/sum,gapf);
1685 //       p1 = pow(p1/sum,gaph);
1686 //       sum = p0+p1+FLT_MIN;
1687 //       tr[i][D2M] = fast_log2(p0/sum);
1688 //       tr[i][D2D] = fast_log2(p1/sum);
1689
1690       tr[i][D2M] = fast_log2(p0/sum);
1691       tr[i][D2D] = fast_log2(p1/sum)*gaph;
1692
1693       // SS-dependent gap penalties
1694       tr[i][M2M_GAPOPEN]=tr[i][M2M];
1695       tr[i][GAPOPEN]=0.0;
1696       tr[i][GAPEXTD]=0.0;
1697     }
1698
1699   if (v>=4)
1700     {
1701       printf("\nPseudocount transition probabilities:\n");
1702       printf("pM2M=%4.1f%%, pM2I=%4.1f%%, pM2D=%4.1f%%, ",100*pM2M,100*pM2I,100*pM2D);
1703       printf("pI2M=%4.1f%%, pI2I=%4.1f%%, ",100*pI2M,100*pI2I);
1704       printf("pD2M=%4.1f%%, pD2D=%4.1f%% ",100*pD2M,100*pD2D);
1705       printf("tau = %4.1f%%\n\n",100.*gapb/(Neff_HMM-1+gapb));
1706       printf("Listing transition probabilities WITH pseudocounts:\n");
1707       printf("   i dssp pred sacc     M->M   M->I   M->D   I->M   I->I   D->M   D->D\n");
1708
1709       for (i=1; i<=L; ++i) //for all columns in HMM
1710         {
1711           printf("%4i  %1c    %1c    %1c    %6.3f %6.3f %6.3f ",i,i2ss(ss_dssp[i]),i2ss(ss_pred[i]),i2sa(sa_dssp[i]),fpow2(tr[i][M2M]),fpow2(tr[i][M2I]),fpow2(tr[i][M2D]));
1712           printf("%6.3f %6.3f ",fpow2(tr[i][I2M]),fpow2(tr[i][I2I]));
1713           printf("%6.3f %6.3f ",fpow2(tr[i][D2M]),fpow2(tr[i][D2D]));
1714           printf("%1i %2i  %1i\n",ss_pred[i],ss_conf[i],ss_dssp[i]);
1715         }
1716       printf("\n");
1717       printf("nss_dssp=%i  nss_pred=%i\n",nss_dssp,nss_pred);
1718     }
1719   return;
1720 }
1721
1722
1723 //////////////////////////////////////////////////////////////////////////////
1724 /**
1725  * @brief Use secondary structure-dependent gap penalties
1726  *    on top of the HMM transition penalties
1727  */
1728 void
1729 HMM::UseSecStrucDependentGapPenalties()
1730 {
1731   int i;   // column in HMM
1732   int ii;
1733   //unsigned char iis[MAXRES]; // inside-integer array
1734   unsigned char iis[par.maxResLen]; // inside-integer array
1735   float d; // Additional penalty for opening gap whithin SS element
1736   float e; // Additional penalty for extending gap whithin SS element
1737
1738   // Determine inside-integers:
1739   // CCSTCCCHHHHHHHHHHHCCCCCEEEEECCSBGGGCCCCEECC
1740   // 0000000123444432100000012210000000000001000
1741   ii=0;
1742   for (i=0; i<=L; ++i) // forward run
1743     {
1744       if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0;
1745       iis[i]=ii;
1746     }  for (i=0; i<=L; ++i)
1747   ii=0;
1748   iis[0]=iis[L]=0;
1749   for (i=L; i>=0; i--) // backward run
1750     {
1751       if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0;
1752       iis[i-1]=imin(ii,iis[i-1]);
1753     }
1754
1755   // Add SS-dependent gap penalties to HMM transition penalties
1756   for (i=0; i<=L; ++i) //for all columns in HMM
1757     {
1758       d=-iis[i]*par.ssgapd;
1759       e=-iis[i]*par.ssgape;
1760       tr[i][GAPOPEN]=d;
1761       tr[i][GAPEXTD]=e;
1762       tr[i][M2M_GAPOPEN]+=d;
1763       tr[i][M2I]+=d;
1764       tr[i][I2M]+=d;
1765       tr[i][I2I]+=e;
1766       tr[i][M2D]+=d;
1767       tr[i][D2M]+=d;
1768       tr[i][D2D]+=e;
1769     }
1770
1771   if (v>=3)
1772     {
1773       printf("Col SS II\n");
1774       for (i=0; i<=L; ++i) printf("%3i  %c %2i\n",i,i2ss(ss_dssp[i]),iis[i]);
1775     }
1776   return;
1777 }
1778
1779 /////////////////////////////////////////////////////////////////////////////////////
1780 /**
1781  * @brief Generate an amino acid frequency matrix g[][] with full pseudocount admixture (tau=1)
1782  */
1783 void
1784 HMM::PreparePseudocounts()
1785 {
1786   for (int i=0; i<=L+1; ++i)
1787     for (int a=0; a<20; ++a)
1788       g[i][a] = // produces fast code
1789        R[a][0]*f[i][0]  +R[a][1]*f[i][1]  +R[a][2]*f[i][2]  +R[a][3]*f[i][3]  +R[a][4]*f[i][4]
1790       +R[a][5]*f[i][5]  +R[a][6]*f[i][6]  +R[a][7]*f[i][7]  +R[a][8]*f[i][8]  +R[a][9]*f[i][9]
1791       +R[a][10]*f[i][10]+R[a][11]*f[i][11]+R[a][12]*f[i][12]+R[a][13]*f[i][13]+R[a][14]*f[i][14]
1792       +R[a][15]*f[i][15]+R[a][16]*f[i][16]+R[a][17]*f[i][17]+R[a][18]*f[i][18]+R[a][19]*f[i][19];
1793 }
1794
1795 /////////////////////////////////////////////////////////////////////////////////////
1796 /**
1797  * @brief Add amino acid pseudocounts to HMM and calculate average protein aa probabilities pav[a]
1798  * Pseudocounts: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1799  */
1800 void
1801 HMM::AddAminoAcidPseudocounts(char pcm, float pca, float pcb, float pcc)
1802 {
1803   int i;               //position in HMM
1804   int a;               //amino acid (0..19)
1805   float sum;
1806   float tau;           //tau = pseudocount admixture
1807
1808   for (a=0; a<20; ++a) pav[a]=pb[a]*100.0f/Neff_HMM; // initialize vector of average aa freqs with pseudocounts
1809
1810   // Calculate amino acid frequencies p[i][a] = (1-tau(i))*f[i][a] + tau(i)*g[i][a]
1811   switch (pcm)
1812     {
1813     case 0: //no pseudocounts whatsoever: tau=0
1814       for (i=1; i<=L; ++i)
1815         for (a=0; a<20; ++a)
1816           pav[a] += ( p[i][a]=f[i][a] );
1817       break;
1818     case 1: //constant pseudocounts (for optimization): tau = pca
1819       tau = pca;
1820       for (i=1; i<=L; ++i)
1821         for (a=0; a<20; ++a)
1822           pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1823       break;
1824     case 2: //divergence-dependent pseudocounts
1825     case 4: //divergence-dependent pseudocounts and rate matrix rescaling
1826       if (par.pcc==1.0f)
1827         for (i=1; i<=L; ++i)
1828           {
1829             tau = fmin(1.0, pca/(1. + Neff_M[i]/pcb ) );
1830             for (a=0; a<20; ++a)
1831               pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1832           }
1833       else
1834         for (i=1; i<=L; ++i)
1835           {
1836             tau = fmin(1.0, pca/(1. + pow((Neff_M[i])/pcb,pcc)));
1837             for (a=0; a<20; ++a)
1838               pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1839           }
1840       break;
1841     case 3: // constant-divergence pseudocounts
1842       for (i=1; i<=L; ++i)
1843         {
1844           float x = Neff_M[i]/pcb;
1845           pca = 0.793 + 0.048*(pcb-10.0);
1846           tau = fmax(0.0, pca*(1-x + pcc*x*(1-x)) );
1847           for (a=0; a<20; ++a)
1848             pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1849         }
1850       if (v>=2) { printf("Divergence before / after addition of amino acid pseudocounts: %5.2f / %5.2f\n",Neff_HMM, CalcNeff()); }
1851      break;
1852     } //end switch (pcm)
1853
1854
1855   // Normalize vector of average aa frequencies pav[a]
1856   NormalizeTo1(pav,NAA);
1857
1858   for (a=0; a<20; ++a)
1859     p[0][a] = p[L+1][a] = pav[a];
1860
1861   // DEBUGGING output
1862   if (v>=3)
1863     {
1864       switch (pcm)
1865         {
1866         case 0:
1867           cout<<"No pseudocounts added (-pcm 0)\n";
1868           return;
1869         case 1:
1870           cout<<"Adding constant AA pseudocount admixture of "<<pca<<" to HMM "<<name<<"\n";
1871           break;
1872         case 2:
1873           cout<<"Adding divergence-dependent AA pseudocounts (-pcm 2) with admixture of "
1874               <<pca/(1.+pow((Neff_HMM-1.)/pcb,pcc))<<" to HMM "<<name<<"\n";
1875           break;
1876         } //end switch (pcm)
1877       cout<<"\nAverage amino acid frequencies WITH pseudocounts in HMM: \nProf: ";
1878       for (a=0; a<20; ++a) printf("%4.1f ",100*pav[a]);
1879       cout<<"\n";
1880       if (v>=4)
1881         {
1882           cout<<"\nAmino acid frequencies WITHOUT pseudocounts:\n       A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\n";
1883           for (i=1; i<=L; ++i)
1884             {
1885               printf("%3i:  ",i);
1886               sum=0;
1887               for (a=0; a<20; ++a)
1888                 {
1889                   sum+=f[i][a];
1890                   printf("%4.1f ",100*f[i][a]);
1891                 }
1892               printf("  sum=%5.3f\n",sum);
1893             }
1894           cout<<"\nAmino acid frequencies WITH pseudocounts:\n       A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\n";
1895           for (i=1; i<=L; ++i)
1896             {
1897               printf("%3i:  ",i);
1898               sum=0;
1899               for (a=0; a<20; ++a)
1900                 {
1901                   sum+=p[i][a];
1902                   printf("%4.1f ",100*p[i][a]);
1903                 }
1904               printf("  sum=%5.3f\n",sum);
1905             }
1906         }
1907    }
1908   return;
1909 }
1910
1911
1912 /////////////////////////////////////////////////////////////////////////////////////
1913 /**
1914  * @brief Factor Null model into HMM t
1915  */
1916 void
1917 HMM::IncludeNullModelInHMM(HMM& q, HMM& t)
1918 {
1919
1920   int i,j;        //query and template match state indices
1921   int a;          //amino acid index
1922
1923   switch (par.columnscore)
1924     {
1925     default:
1926     case 0: // Null model with background prob. from database
1927       for (a=0; a<20; ++a) pnul[a]=pb[a];
1928       break;
1929
1930     case 1: // Null model with background prob. equal average from query and template
1931       for (a=0; a<20; ++a) pnul[a]=0.5*(q.pav[a]+t.pav[a]);
1932       break;
1933
1934     case 2: // Null model with background prob. from template protein
1935       for (a=0; a<20; ++a) pnul[a]=t.pav[a];
1936       break;
1937
1938     case 3: // Null model with background prob. from query protein
1939       for (a=0; a<20; ++a) pnul[a]=q.pav[a];
1940       break;
1941
1942     case 4: // Null model with background prob. equal average from query and template
1943       for (a=0; a<20; ++a) pnul[a]=sqrt(q.pav[a]*t.pav[a]);
1944       break;
1945
1946    case 10: // Separated column scoring for Stochastic Backtracing (STILL USED??)
1947       for (i=0; i<=q.L+1; ++i)
1948         {
1949           float sum = 0.0;
1950           for (a=0; a<20; ++a) sum += pb[a]*q.p[i][a];
1951           sum = 1.0/sqrt(sum);
1952           for (a=0; a<20; ++a) q.p[i][a]*=sum;
1953         }
1954       for (j=0; j<=t.L+1; j++)
1955         {
1956           float sum = 0.0;
1957           for (a=0; a<20; ++a) sum += pb[a]*t.p[j][a];
1958           sum = 1.0/sqrt(sum);
1959           for (a=0; a<20; ++a) t.p[j][a]*=sum;
1960         }
1961       break;
1962
1963     case 11:  // log co-emission probability (no null model)
1964       for (a=0; a<20; ++a) pnul[a]=0.05;
1965       break;
1966
1967    }
1968
1969   // !!!!! ATTENTION!!!!!!!  after this t.p is not the same as after adding pseudocounts !!!
1970   //Introduce amino acid weights into template (for all but SOP scores)
1971   if (par.columnscore!=10)
1972     for (a=0; a<20; ++a)
1973       for (j=0; j<=t.L+1; j++)
1974         t.p[j][a]/=pnul[a];
1975
1976   if (v>=5)
1977     {
1978       cout<<"\nAverage amino acid frequencies\n";
1979       cout<<"         A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\n";
1980       cout<<"Q:    ";
1981       for (a=0; a<20; ++a) printf("%4.1f ",100*q.pav[a]);
1982       cout<<"\nT:    ";
1983       for (a=0; a<20; ++a) printf("%4.1f ",100*t.pav[a]);
1984       cout<<"\nNull: ";
1985       for (a=0; a<20; ++a) printf("%4.1f ",100*pnul[a]);
1986       cout<<"\npb:   ";
1987       for (a=0; a<20; ++a) printf("%4.1f ",100*pb[a]);
1988     }
1989
1990
1991   return;
1992 }
1993
1994
1995 /////////////////////////////////////////////////////////////////////////////////////
1996 /**
1997  * @brief Write HMM to output file
1998  */
1999 void
2000 HMM::WriteToFile(char* outfile)
2001 {
2002   const int SEQLEN=100;      // number of residues per line for sequences to be displayed
2003   int i,a;
2004
2005   if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);}
2006
2007   FILE *outf=NULL;
2008   if (strcmp(outfile,"stdout"))
2009     {
2010       if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w");
2011       if (!outf) OpenFileError(outfile);
2012     }
2013   else
2014     outf = stdout;
2015   if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n";
2016
2017 //   fprintf(outf,"HHsearch HHM format 1.5\n");
2018   fprintf(outf,"HHsearch 1.5\n");         // format specification
2019   fprintf(outf,"NAME  %s\n",longname);    // name of first sequence
2020   fprintf(outf,"FAM   %s\n",fam);         // family name
2021   char file_nopath[NAMELEN];
2022   RemovePath(file_nopath,file);
2023   fprintf(outf,"FILE  %s\n",file_nopath); // base name of alignment file
2024
2025   // Print command line
2026   fprintf(outf,"COM   ");
2027   for (int i=0; i<par.argc; i++)
2028     if (strlen(par.argv[i])<=100)
2029       fprintf(outf,"%s ",par.argv[i]);
2030     else
2031       fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i]));
2032   fprintf(outf,"\n");
2033
2034   // print out date stamp
2035   time_t* tp=new(time_t);
2036   *tp=time(NULL);
2037   fprintf(outf,"DATE  %s",ctime(tp));
2038   delete tp; tp = NULL; /* really? FS */
2039
2040   // Print out some statistics of alignment
2041   fprintf(outf,"LENG  %i match states, %i columns in multiple alignment\n",L,l[L]);
2042   fprintf(outf,"FILT  %i out of %i sequences passed filter (-id %i -cov %i -qid %i -qsc %.2f -diff %i)\n",N_filtered,N_in,par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);
2043   fprintf(outf,"NEFF  %-4.1f\n",Neff_HMM);
2044
2045   // Print selected sequences from alignment (including secondary structure and confidence values, if known)
2046   fprintf(outf,"SEQ\n");
2047   for (int n=0; n<n_display; n++)
2048     {
2049       fprintf(outf,">%s\n",sname[n]);
2050       //first sequence character starts at 1; 0 not used.
2051       for(unsigned int j=0; j<strlen(seq[n]+1); j+=SEQLEN) fprintf(outf,"%-.*s\n",SEQLEN,seq[n]+1+j);
2052     }
2053   fprintf(outf,"#\n");
2054
2055   // print null model background probabilities from substitution matrix
2056   fprintf(outf,"NULL   ");
2057   for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(pb[s2a[a]])*HMMSCALE ));
2058   fprintf(outf,"\n");
2059
2060   // print table header line with amino acids
2061   fprintf(outf,"HMM    ");
2062   for (a=0; a<20; ++a) fprintf(outf,"%1c\t",i2aa(s2a[a]));
2063   fprintf(outf,"\n");
2064
2065   // print table header line with state transitions
2066   fprintf(outf,"       M->M\tM->I\tM->D\tI->M\tI->I\tD->M\tD->D\tNeff\tNeff_I\tNeff_D\n");
2067
2068   // print out transition probabilities from begin state (virtual match state)
2069   fprintf(outf,"       ");
2070   for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[0][a]*HMMSCALE));
2071   fout(outf,iround(Neff_M[0]*HMMSCALE));
2072   fout(outf,iround(Neff_I[0]*HMMSCALE));
2073   fout(outf,iround(Neff_D[0]*HMMSCALE));
2074   fprintf(outf,"\n");
2075
2076   // Start loop for printing HMM columns
2077   int h=1;
2078   for (i=1; i<=L; ++i)
2079     {
2080
2081       while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++;
2082       fprintf(outf,"%1c %-4i ",seq[nfirst][h++],i);
2083
2084       // Print emission probabilities for match state
2085       for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(p[i][s2a[a]])*HMMSCALE ));
2086       fprintf(outf,"%-i",l[i]);
2087       fprintf(outf,"\n");
2088
2089       // Print transition probabilities
2090       fprintf(outf,"       ");
2091       for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[i][a]*HMMSCALE));
2092       fout(outf,iround(Neff_M[i]*HMMSCALE));
2093       fout(outf,iround(Neff_I[i]*HMMSCALE));
2094       fout(outf,iround(Neff_D[i]*HMMSCALE));
2095       fprintf(outf,"\n\n");
2096     } // end for(i)-loop for printing HMM columns
2097
2098   fprintf(outf,"//\n");
2099   fclose(outf);
2100 }
2101
2102 /////////////////////////////////////////////////////////////////////////////////////
2103 /**
2104  * @brief Write HMM to output file
2105  */
2106 void
2107 HMM::InsertCalibration(char* infile)
2108 {
2109   char* line =  new(char[LINELEN]);    // input line
2110   char** lines = new char*[3*L+100000];
2111   int nline=0;
2112   int l;
2113   char done=0;   // inserted new 'EVD mu sigma' line?
2114
2115   // Read from infile all lines and insert the EVD line with lamda and mu coefficients
2116   ifstream inf;
2117   inf.open(infile, ios::in);
2118   if (!inf) OpenFileError(infile);
2119   if (v>=2) cout<<"Recording calibration coefficients in "<<infile<<"\n";
2120
2121   while (inf.getline(line,LINELEN) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen)
2122     {
2123
2124       // Found an EVD lamda mu line? -> remove
2125       while (!done && !strncmp(line,"EVD ",3) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen)
2126         inf.getline(line,LINELEN);
2127       if ((line[0]=='/' && line[1]=='/') || nline>=2*/*MAXRES*/par.maxResLen)
2128         {fprintf(stderr,"Error: wrong format in %s. Expecting hhm format\n",infile); exit(1);}
2129
2130       // Found the SEQ line? -> insert calibration before this line
2131       if (!done && (!strncmp("SEQ",line,3) || !strncmp("HMM",line,3)) && (isspace(line[3]) || line[3]=='\0'))
2132         {
2133           done=1;
2134           lines[nline]=new(char[128]);
2135           if (!lines[nline]) MemoryError("space to read in HHM file for calibration");
2136           sprintf(lines[nline],"EVD   %-7.4f %-7.4f",lamda,mu);
2137           nline++;
2138         }
2139       lines[nline]=new char[strlen(line)+1];
2140       if (!lines[nline]) MemoryError("space to read in HHM file for calibration");
2141       strcpy (lines[nline],line);
2142       nline++;
2143     }
2144   inf.close();
2145
2146   // Write to infile all lines
2147   FILE* infout=fopen(infile,"w");
2148   if (!infout) {
2149     cerr<<endl<<"WARNING in "<<program_name<<": no calibration coefficients written to "<<infile<<":\n";
2150     cerr<<"Could not open file for writing.\n";
2151     return;
2152   }
2153   for (l=0; l<nline; l++) {
2154     fprintf(infout,"%s\n",lines[l]);
2155     delete[] lines[l];  lines[l] = NULL;
2156   }
2157   fprintf(infout,"//\n");
2158   fclose(infout);
2159   delete[] line;  line = NULL;
2160   delete[] lines;  lines = NULL;
2161   return;
2162 }
2163
2164 /////////////////////////////////////////////////////////////////////////////////////
2165 /**
2166  * @brief Write HMM to output file in HMMER format
2167  */
2168 void
2169 HMM::WriteToFileHMMER(char* outfile)
2170 {
2171   const int INTSCALE=1000;  //scaling factor in HMMER files
2172   const float pBD=0.50;
2173   const int LOG2pBD=iround(fast_log2(pBD)*INTSCALE);
2174   const int LOG2pBM=iround(fast_log2(1-pBD)*INTSCALE);
2175   const float pJB=1.0/350;
2176   const int LOG2pJB=iround(fast_log2(pJB)*INTSCALE);
2177   const int LOG2pJJ=iround(fast_log2(1-pJB)*INTSCALE);
2178   const float pEJ=0.5;
2179   const int LOG2pEJ=iround(fast_log2(pEJ)*INTSCALE);
2180   const int LOG2pEC=iround(fast_log2(1-pEJ)*INTSCALE);
2181   char c;
2182   int i,a;
2183
2184   if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);}
2185
2186   FILE *outf=NULL;
2187   if (strcmp(outfile,"stdout"))
2188     {
2189       if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w");
2190       if (!outf) OpenFileError(outfile);
2191     }
2192   else
2193     outf = stdout;
2194   if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n";
2195
2196   fprintf(outf,"HMMER2.0  [hhmake %s]\n",VERSION_AND_DATE);
2197   fprintf(outf,"NAME  %s\n",file);       // base name of alignment file
2198   fprintf(outf,"DESC  %s\n",longname);
2199   fprintf(outf,"LENG  %i\n",L);
2200   fprintf(outf,"ALPH  Amino\n");         // amino acid seuqences (not DNA)
2201   fprintf(outf,"RF    yes\n");            // reference annotation flag
2202   fprintf(outf,"CS    yes\n");           // consensus structure annotation flag
2203   fprintf(outf,"MAP   yes\n");           // write MA column number after each line of aa probabilities
2204
2205   fprintf(outf,"COM   ");                // print out command line
2206   for (i=0; i<=par.argc-1; ++i) fprintf(outf,"%s ",par.argv[i]); fprintf(outf,"\n");
2207
2208   fprintf(outf,"NSEQ  %i\n",N_filtered); // print number of sequences after filtering
2209
2210   // Date stamp
2211   time_t* tp=new(time_t);
2212   *tp=time(NULL);
2213   fprintf(outf,"DATE  %s",ctime(tp));
2214   delete tp; tp = NULL; /* really? FS */
2215
2216   // Print out secondary structure
2217   if (nss_dssp>=0)
2218     fprintf(outf,"SSDSS %s\n",seq[nss_dssp]);
2219   if (nsa_dssp>=0)
2220     fprintf(outf,"SADSS %s\n",seq[nsa_dssp]);
2221   if (nss_pred>=0)
2222     fprintf(outf,"SSPRD %s\n",seq[nss_pred]);
2223   if (nss_conf>=0)
2224     fprintf(outf,"SSCNF %s\n",seq[nss_conf]);
2225
2226
2227   // Special Plan7 transitions that control repeated detection of profile HMM within sequence
2228   fprintf(outf,"XT     %6i %6i %6i %6i %6i %6i %6i %6i\n",LOG2pJB,LOG2pJJ,LOG2pEC,LOG2pEJ,LOG2pJB,LOG2pJJ,LOG2pJB,LOG2pJJ);
2229   fprintf(outf,"NULT      -4  -8455\n");
2230
2231
2232   // Null model background probabilities from substitution matrix
2233   fprintf(outf,"NULE  ");
2234   for (a=0; a<20; ++a)
2235     {
2236       float lg2=fast_log2(pb[s2a[a]]*20.0);
2237       if (lg2<-99.999) fprintf(outf,"      *"); else fprintf(outf," %6i",iround(lg2*INTSCALE));
2238     }
2239   fprintf(outf,"\n");
2240
2241   // Table header line with amino acids
2242   fprintf(outf,"HMM    ");
2243   for (a=0; a<20; ++a) fprintf(outf,"     %1c ",i2aa(s2a[a]));
2244   fprintf(outf,"\n");
2245
2246   // Table header line with state transitions
2247   fprintf(outf,"         m->m   m->i   m->d   i->m   i->i   d->m   d->d   b->m   m->e\n");
2248
2249   // Transition probabilities from begin state
2250   fprintf(outf,"       %6i      * %6i\n",LOG2pBM,LOG2pBD);
2251
2252   // Start loop for printing HMM columns
2253   int h=1, hss=1;
2254   for (i=1; i<=L; ++i)
2255     {
2256
2257       // Emission probabilities for match state
2258       fprintf(outf," %5i",i);
2259       for (a=0; a<20; ++a) fprintf(outf," %6i",imax(-9999,iround(fast_log2(p[i][s2a[a]]/pb[s2a[a]])*INTSCALE)));
2260       fprintf(outf," %5i",l[i]);
2261       fprintf(outf,"\n");
2262
2263       // Emission probabilities (relative to null model) for insert state
2264       while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++;
2265       if (i==L)
2266         fprintf(outf,"     %1c      *      *      *      *      *      *      *      *      *      *      *      *      *      *      *      *      *      *      *      *\n",seq[nfirst][h++]);
2267       else
2268         fprintf(outf,"     %1c      0      0      0      0      0      0      0      0      0      0      0      0      0      0      0      0      0      0      0      0\n",seq[nfirst][h++]);
2269
2270       // Transition probabilities
2271       if (nss_dssp>=0)
2272         {
2273           while(islower(seq[nss_dssp][hss]) && seq[nss_dssp][hss]) hss++;
2274           c=seq[nss_dssp][hss++];
2275         }
2276       else c=' ';
2277       fprintf(outf,"     %1c",c);
2278       if (i==1)
2279         {
2280           for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE)));
2281           fprintf(outf," %6i      *\n",LOG2pBM);
2282         }
2283       else if (i==L)
2284         {
2285           for (a=0; a<=D2D; ++a) fprintf(outf,"     *");
2286           fprintf(outf,"      *      0\n");
2287         }
2288       else
2289         {
2290           for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE)));
2291           fprintf(outf,"      *      *\n");
2292         }
2293     }
2294
2295   fprintf(outf,"//\n");
2296   fclose(outf);
2297 }
2298
2299
2300 /////////////////////////////////////////////////////////////////////////////////////
2301 /**
2302  * @brief Transform log to lin transition probs
2303  */
2304 void
2305 HMM::Log2LinTransitionProbs(float beta)
2306 {
2307   if (trans_lin==1) return;
2308   trans_lin=1;
2309   for (int i=0; i<=L; ++i)
2310     {
2311       for (int a=0; a<NTRANS; ++a)
2312         tr[i][a] = fpow2(beta*tr[i][a]);
2313 /* FIXME valgrind says: "Conditional jump or move depends on
2314  * uninitialised value(s)" when using hmm iteration
2315  */
2316     }
2317 }
2318
2319
2320 /**
2321  * @brief Set query columns in His-tags etc to Null model distribution
2322  */
2323 void
2324 HMM::NeutralizeTags()
2325 {
2326   char* qseq = seq[nfirst];
2327   char* pt;
2328   int a,i;
2329
2330   if (NULL == qseq){
2331       return;
2332   }
2333
2334   // Neutralize His tag
2335   if ( (pt=strstr(qseq,"HHHHH")) )
2336     {
2337       int i0 = pt-qseq+1;
2338       if (v>=2) printf("Neutralized His-tag at position %i\n",i0);
2339       for (i=imax(i0-5,1); i<i0; ++i)   // neutralize leading 5 columns
2340         for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2341       for (; (*pt)!='H'; ++i,++pt)      // neutralize His columns
2342         for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2343       i0=i;
2344        for (; i<imin(i0+5,L+1); ++i)    // neutralize trailing 5 columns
2345         for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2346        if (v>=3) printf("start:%i  end:%i\n",imax(i0-5,1),i-1);
2347     }
2348
2349   // Neutralize C-myc tag
2350   if ( (pt=strstr(qseq,"EQKLISEEDL")) )
2351     {
2352       if (v>=2) printf("Neutralized C-myc-tag at position %i\n",int(pt-qseq)+1);
2353       for (i=pt-qseq+1; i<=pt-qseq+10; ++i)
2354         for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2355     }
2356   // Neutralize FLAG tag
2357   if ( (pt=strstr(qseq,"DYKDDDDK")) )
2358     {
2359       if (v>=2) printf("Neutralized FLAG-tag at position %i\n",int(pt-qseq)+1);
2360       for (i=pt-qseq+1; i<=pt-qseq+8; ++i)
2361         for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2362     }
2363 }
2364
2365
2366
2367 /////////////////////////////////////////////////////////////////////////////////////
2368 /**
2369  * @brief Calculate effective number of sequences using profiles INCLUDING pseudocounts
2370  */
2371 float
2372 HMM::CalcNeff()
2373 {
2374   float Neff=0;
2375   for (int i=1; i<=L; ++i)
2376     for (int a=0; a<20; ++a)
2377       if (p[i][a]>1E-10) Neff-=p[i][a]*fast_log2(p[i][a]);
2378   return fpow2(Neff/L);
2379 }
2380
2381
2382 /////////////////////////////////////////////////////////////////////////////////////
2383 /**
2384  * @brief Calculate consensus of HMM (needed to merge HMMs later)
2385  */
2386 void
2387 HMM::CalculateConsensus()
2388 {
2389   int i;      // position in query
2390   int a;      // amino acid
2391   if (!Xcons) Xcons = new char[/*MAXRES*/par.maxResLen+2];
2392   for (i=1; i<=L; ++i)
2393     {
2394       float max=f[i][0]-pb[0];
2395       for (a=1; a<20; ++a)
2396         if (f[i][a]-pb[a]>max) Xcons[i]=a;
2397     }
2398   Xcons[0]=Xcons[L+1]=ENDGAP;
2399 }
2400
2401 // /////////////////////////////////////////////////////////////////////////////////////
2402 // // Store linear transition probabilities
2403 // /////////////////////////////////////////////////////////////////////////////////////
2404 // void HMM::StoreLinearTransitionProbs()
2405 // {
2406 //   int i;      // position in query
2407 //   for (i=0; i<=L+1; ++i) if (!tr_lin[i]) tr_lin[i] = new(float[NTRANS]);
2408 //   for (i=0; i<=L+1; ++i)
2409 //     {
2410 //       tr_lin[i][M2M] = fpow2(tr[i][M2M]);
2411 //       tr_lin[i][M2I] = fpow2(tr[i][M2I]);
2412 //       tr_lin[i][M2D] = fpow2(tr[i][M2D]);
2413 //       tr_lin[i][D2M] = fpow2(tr[i][M2D]);
2414 //       tr_lin[i][D2D] = fpow2(tr[i][D2D]);
2415 //       tr_lin[i][I2M] = fpow2(tr[i][I2M]);
2416 //       tr_lin[i][I2I] = fpow2(tr[i][I2I]);
2417 //     }
2418 // }
2419
2420
2421 // #define Weff(Neff) (1.0+par.neffa*(Neff-1.0)+(par.neffb-4.0*par.neffa)/16.0*(Neff-1.0)*(Neff-1.0))
2422
2423 // /////////////////////////////////////////////////////////////////////////////////////
2424 // // Initialize f[i][a] with query HMM
2425 // /////////////////////////////////////////////////////////////////////////////////////
2426 // void HMM::MergeQueryHMM(HMM& q, float wk[])
2427 // {
2428 //   int i;      // position in query
2429 //   int a;      // amino acid
2430 //   float Weff_M, Weff_D, Weff_I;
2431 //   for (i=1; i<=L; i++)
2432 //     {
2433 //       Weff_M = Weff(q.Neff_M[i]-1.0);
2434 //       Weff_D = Weff(q.Neff_D[i]-1.0);
2435 //       Weff_I = Weff(q.Neff_I[i]-1.0);
2436 //       for (a=0; a<20; a++) f[i][a] = q.f[i][a]*wk[i]*Weff_M;
2437 //       tr_lin[i][M2M] = q.tr_lin[i][M2M]*wk[i]*Weff_M;
2438 //       tr_lin[i][M2I] = q.tr_lin[i][M2I]*wk[i]*Weff_M;
2439 //       tr_lin[i][M2D] = q.tr_lin[i][M2D]*wk[i]*Weff_M;
2440 //       tr_lin[i][D2M] = q.tr_lin[i][D2M]*wk[i]*Weff_D;
2441 //       tr_lin[i][D2D] = q.tr_lin[i][D2D]*wk[i]*Weff_D;
2442 //       tr_lin[i][I2M] = q.tr_lin[i][I2M]*wk[i]*Weff_I;
2443 //       tr_lin[i][I2I] = q.tr_lin[i][I2I]*wk[i]*Weff_I;
2444 //     }
2445 // }
2446
2447
2448
2449 // /////////////////////////////////////////////////////////////////////////////////////
2450 // // Normalize probabilities in total merged super-HMM
2451 // /////////////////////////////////////////////////////////////////////////////////////
2452 // void HMM::NormalizeHMMandTransitionsLin2Log()
2453 // {
2454 //   int i;      // position in query
2455 //   int a;      // amino acid
2456 //   for (i=0; i<=L+1; i++)
2457 //     {
2458 //       float sum=0.0;
2459 //       for (a=0; a<20; a++) sum += f[i][a];
2460 //       for (a=0; a<20; a++) f[i][a]/=sum;
2461 //       sum = tr_lin[i][M2M] + tr_lin[i][M2I] + tr_lin[i][M2D];
2462 //       tr_lin[i][M2M] /= sum;
2463 //       tr_lin[i][M2I] /= sum;
2464 //       tr_lin[i][M2D] /= sum;
2465 //       tr[i][M2M] = fast_log2(tr_lin[i][M2M]);
2466 //       tr[i][M2I] = fast_log2(tr_lin[i][M2I]);
2467 //       tr[i][M2D] = fast_log2(tr_lin[i][M2D]);
2468 //       sum = tr_lin[i][D2M] + tr_lin[i][D2D];
2469 //       tr_lin[i][D2M] /= sum;
2470 //       tr_lin[i][D2D] /= sum;
2471 //       tr[i][D2M] = fast_log2(tr_lin[i][D2M]);
2472 //       tr[i][D2D] = fast_log2(tr_lin[i][D2D]);
2473 //       sum = tr_lin[i][I2M] + tr_lin[i][I2I];
2474 //       tr_lin[i][I2M] /= sum;
2475 //       tr_lin[i][I2I] /= sum;
2476 //       tr[i][I2M] = fast_log2(tr_lin[i][I2M]);
2477 //       tr[i][I2I] = fast_log2(tr_lin[i][I2I]);
2478 //    }
2479 // }
2480
2481
2482 // UNCOMMENT TO ACTIVATE COMPOSITIONALLY BIASED PSEUDOCOUNTS BY RESCALING THE RATE MATRIX
2483
2484 // /////////////////////////////////////////////////////////////////////////////////////
2485 // //// Function to minimize
2486 // /////////////////////////////////////////////////////////////////////////////////////
2487 // double RescaleMatrixFunc(double x[])
2488 // {
2489 //   double sum=0.0;
2490 //   for (int a=0; a<20; ++a)
2491 //     {
2492 //       double za=0.0;
2493 //       for (int b=0; b<20; ++b) za+=P[a][b]*x[b];
2494 //       sum += (x[a]*za-qav[a])*(x[a]*za-qav[a]);
2495 //     }
2496 //   return sum;
2497 // }
2498
2499 // /////////////////////////////////////////////////////////////////////////////////////
2500 // //// Gradient of function to minimize
2501 // /////////////////////////////////////////////////////////////////////////////////////
2502 // void RescaleMatrixFuncGrad(double grad[], double x[])
2503 // {
2504 //   double z[20] = {0.0};
2505 //   double w[20];
2506 //   double tmp;
2507 //   for (int a=0; a<20; ++a)
2508 //     for (int b=0; b<20; ++b) z[a] += P[a][b]*x[b];
2509
2510 //   for (int a=0; a<20; ++a) w[a] = x[a]*z[a]-qav[a];
2511 //   for (int a=0; a<20; ++a)
2512 //     {
2513 //       tmp = w[a]*z[a];
2514 //       for (int b=0; b<20; ++b) tmp += P[a][b]*x[b]*w[b];
2515 //       grad[a] = 2.0*tmp;
2516 //     }
2517 //   return;
2518 // }
2519
2520
2521 // /////////////////////////////////////////////////////////////////////////////////////
2522 // //// Rescale a substitution matrix to biased aa frequencies in global vector qav[a]
2523 // /////////////////////////////////////////////////////////////////////////////////////
2524 // void HMM::RescaleMatrix()
2525 // {
2526 //   int a,b;
2527 //   int code;
2528 //   double x[21];     // scaling factor
2529 //   double val_min;
2530 //   const int len=20;
2531 //   const int max_iterations=50;
2532
2533 //   if (v>=2) printf("Adjusting rate matrix to query amino acid composition ...\n");
2534
2535 //   // Put amino acid frequencies into global array (needed to call WNLIB's conjugate gradient method)
2536 //   for (a=0; a<20; ++a) qav[a] = pav[a];
2537
2538 //   // Initialize scaling factors x[a]
2539 //   for (a=0; a<20; ++a) x[a]=pow(qav[a]/pb[a],0.73);  // Initialize
2540
2541 //   // Call conjugate gradient minimization method from WNLIB
2542 //   wn_conj_gradient_method(&code,&val_min,x,len,&RescaleMatrixFunc,&RescaleMatrixFuncGrad,max_iterations);
2543
2544
2545 //   // Calculate z[a] = sum_b Pab*xb
2546 //   float sum_err=0.0f;
2547 //   float sum = 0.0f;
2548 //   for (a=0; a<20; ++a)
2549 //     {
2550 //       float za=0.0f; // za = sum_b Pab*xb
2551 //       for (b=0; b<20; ++b) za+=P[a][b]*x[b];
2552 //       sum_err += (x[a]*za/qav[a]-1)*(x[a]*za/qav[a]-1);
2553 //       sum += x[a]*za;
2554 //    }
2555 //   if (sum_err>1e-3 & v>=1) fprintf(stderr,"WARNING: adjusting rate matrix by CG resulted in residual error of %5.3f.\n",sum_err);
2556
2557 //   // Rescale rate matrix
2558 //   for (a=0; a<20; ++a)
2559 //     for (b=0; b<20; ++b)
2560 //       {
2561 //      P[a][b] *= x[a]*x[b]/sum;
2562 //      R[a][b] = P[a][b]/qav[b];
2563 //       }
2564
2565 //   // How well approximated?
2566 //   if (v>=3)
2567 //     {
2568 //       // Calculate z[a] = sum_b Pab*xb
2569 //       float z[21];
2570 //       for (a=0; a<20; ++a)
2571 //      for (z[a]=0.0, b=0; b<20; ++b) z[a]+=P[a][b];
2572 //       printf("Adjust   A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\nErr?  ");
2573 //       for (a=0; a<20; ++a) printf("%4.0f ",1000*z[a]/qav[a]);
2574 //       cout<<endl<<"xa    ";
2575 //       for (a=0; a<20; ++a) fprintf(stdout,"%4.2f ",x[a]);
2576 //       cout<<endl;
2577 //     }
2578
2579 //   // Evaluate sequence identity underlying substitution matrix
2580 //   if (v>=3)
2581 //     {
2582 //       float id=0.0f;
2583 //       float entropy=0.0f;
2584 //       float entropy_qav=0.0f;
2585 //       float mut_info=0.0f;
2586 //       for (a=0; a<20; ++a) id += P[a][a];
2587 //       for (a=0; a<20; ++a)  entropy_qav-=qav[a]*fast_log2(qav[a]);
2588 //       for (a=0; a<20; ++a)
2589 //        for (b=0; b<20; ++b)
2590 //          {
2591 //            entropy-=P[a][b]*fast_log2(R[a][b]);
2592 //            mut_info += P[a][b]*fast_log2(P[a][b]/qav[a]/qav[b]);
2593 //          }
2594
2595 //       fprintf(stdout,"Rescaling rate matrix: sequence identity = %2.0f%%; entropy per column = %4.2f bits (out of %4.2f); mutual information = %4.2f bits\n",100*id,entropy,entropy_qav,mut_info);
2596 //     }
2597 //   return;
2598 // }
2599
2600
2601 /* @* HMM::ClobberGlobal (eg, q,t)
2602  */
2603 void
2604 HMM::ClobberGlobal(void){
2605
2606   for (int i = 0; i < n_display; i++){
2607     if (sname[i]){
2608       delete[] sname[i]; sname[i] = NULL;
2609     }
2610     if (seq[i]){
2611       delete[] seq[i]; seq[i] = NULL;
2612     }
2613   }
2614   Neff_M[0] = Neff_I[0] = Neff_D[0] = 0.0;
2615   longname[0] = '\0'; file[0] = '\0';
2616   ss_dssp[0] = sa_dssp[0] = ss_pred[0] = ss_conf[0] = '\0';
2617   Xcons   = NULL;
2618   l[0] = 0;
2619   L = 0;
2620   Neff_HMM = 0;
2621   n_display = N_in = N_filtered = 0;
2622   nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
2623   lamda = 0.0; mu = 0.0;
2624   name[0] = longname[0] = fam[0] = '\0';
2625
2626   for (int i = 0; i < NAA; i++){
2627     pav[i] = 0;
2628   }
2629
2630   /* @= */
2631   return;
2632
2633 } /* this is the end of ClobberGlobal() */
2634
2635
2636 /*
2637  * EOF hhhmm-C.h
2638  */