binaries/src/clustalo/src/hhalign/hhalignment-C.h

   1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2
   3 /*********************************************************************
   4  * Clustal Omega - Multiple sequence alignment
   5  *
   6  * Copyright (C) 2010 University College Dublin
   7  *
   8  * Clustal-Omega is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This file is part of Clustal-Omega.
  14  *
  15  ********************************************************************/
  16
  17 /*
  18  *  RCS $Id: hhalignment-C.h 316 2016-12-16 16:14:39Z fabian $
  19  */
  20
  21
  22 /*
  23  * Changelog: Michael Remmert made changes to hhalign stand-alone code
  24  * FS implemented some of the changes on 2010-10-28 -> MR1
  25  *
  26  * Note: MR seems to have changed all [aijk]++ to ++[aijk],
  27  * FS did not do that on 2010-10-28
  28  */
  29
  30 // hhalignment.C
  31
  32 //////////////////////////////////////////////////////////////////////////////
  33 //// Class Alignment
  34 //////////////////////////////////////////////////////////////////////////////
  35
  36 // hhalignment.C
  37
  38 #ifndef MAIN
  39 #define MAIN
  40 #include <iostream> // cin, cout, cerr
  41 #include <fstream> // ofstream, ifstream
  42 #include <stdio.h> // printf
  43 using std::cout;
  44 using std::cerr;
  45 using std::endl;
  46 using std::ios;
  47 using std::ifstream;
  48 using std::ofstream;
  49 #include <stdlib.h> // exit
  50 #include <string> // strcmp, strstr
  51 #include <math.h> // sqrt, pow
  52 #include <limits.h> // INT_MIN
  53 #include <float.h> // FLT_MIN
  54 #include <time.h> // clock
  55 #include <ctype.h> // islower, isdigit etc
  56 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
  57 #include "list.h" // list data structure
  58 #include "hash.h" // hash data structure
  59 #include "hhdecl-C.h"
  60 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
  61 #include "hhhmm.h"
  62 #endif
  63
  64 #ifdef HAVE_CONFIG_H
  65 #include "config.h"
  66 #endif
  67 //#include "new_new.h" /* memory tracking */
  68
  69
  70 enum {KEEP_NOT = 0, KEEP_CONDITIONALLY, KEEP_ALWAYS};
  71
  72 //////////////////////////////////////////////////////////////////////////////
  73 // Class Alignment
  74 //////////////////////////////////////////////////////////////////////////////
  75
  76
  77 //////////////////////////////////////////////////////////////////////////////
  78 // Object constructor
  79 //////////////////////////////////////////////////////////////////////////////
  80 Alignment::Alignment(int maxseq, int maxres)
  81 {
  82
  83     //printf(">>>>>>>>%s:%s:%d: maxseq=%d, maxres=%d\n", __FUNCTION__, __FILE__, __LINE__, maxseq, maxres); /* (FS) */
  84   longname = new(char[DESCLEN]);
  85   sname = new char*[maxseq+2]; /* MR1 */
  86   seq = new char*[maxseq+2]; /* MR1 */
  87   l = new int[maxres];
  88   X = new char*[maxseq+2];  /* MR1 */
  89   I = new short unsigned int*[maxseq+2]; /* MR1 */
  90   keep = new char[maxseq+2]; /* MR1 */
  91   display = new char[maxseq+2]; /* MR1 */
  92   wg = new float[maxseq+2]; /* MR1 */
  93   nseqs = new int[maxres+2]; /* MR1 */
  94   N_in=L=0;
  95   nres=NULL; // number of residues per sequence k
  96   first=NULL; // first residue in sequence k
  97   last=NULL; // last residue in sequence k
  98   ksort=NULL; // sequence indices sorted by descending nres[k]
  99   name[0]='\0'; // no name defined yet
 100   longname[0]='\0'; // no name defined yet
 101   fam[0]='\0'; // no name defined yet
 102   file[0]='\0'; // no name defined yet
 103   readCommentLine = '0'; /* MR1 */
 104 }
 105
 106 //////////////////////////////////////////////////////////////////////////////
 107 // Object destructor
 108 //////////////////////////////////////////////////////////////////////////////
 109 Alignment::~Alignment()
 110 {
 111   delete[] longname; longname = NULL;
 112   for(int k=0; k<N_in; k++)
 113     {
 114       delete[] sname[k]; sname[k] = NULL;
 115       delete[] seq[k]; seq[k] = NULL;
 116       delete[] X[k]; X[k] = NULL;
 117       delete[] I[k]; I[k] = NULL;
 118     }
 119   delete[] sname; sname = NULL;
 120   delete[] seq; seq = NULL;
 121   delete[] X; X = NULL;
 122   delete[] I; I = NULL;
 123   delete[] l; l = NULL;
 124   delete[] keep; keep = NULL;
 125   delete[] display; display = NULL;
 126   delete[] wg; wg = NULL;
 127   delete[] nseqs; nseqs = NULL;
 128   delete[] nres; nres = NULL;
 129   delete[] first; first = NULL;
 130   delete[] last; last = NULL;
 131   delete[] ksort; ksort = NULL;
 132 }
 133
 134
 135 /**
 136  * @brief Reads in an alignment from file into matrix seq[k][l] as ASCII
 137  */
 138 void
 139 Alignment::Read(FILE* inf, char infile[], char* firstline)
 140 {
 141   int l; // Postion in alignment incl. gaps (first=1)
 142   int h; // Position in input line (first=0)
 143   int k; // Index of sequence being read currently (first=0)
 144   char line[LINELEN]=""; // input line
 145   //char cur_seq[MAXCOL]; // Sequence currently read in
 146   char *cur_seq=new char[par.maxColCnt];
 147   char* cur_name; // Sequence currently read in
 148   int linenr=0; // current line number in input file
 149   char skip_sequence=0;
 150   RemoveExtension(file,infile); //copy rootname (w/o path) of infile into file variable of class object
 151
 152   kss_dssp=ksa_dssp=kss_pred=kss_conf=kfirst=-1;
 153   n_display=0;
 154   N_in=0;
 155   N_filtered=0;
 156   N_ss=0;
 157   cur_seq[0]=' '; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
 158   l=1; k=-1;
 159
 160   // Does firstline already contain first line of file?
 161   if (firstline!= NULL) strcpy(line,firstline);
 162
 163   /////////////////////////////////////////////////////////////////////////
 164   // Read infile line by line
 165   /* FIXME: not safe to use MAXSEQ, however, don't think we ever get here (FS) */
 166   while(firstline || (fgetline(line,LINELEN,inf) && (k<MAXSEQ))) /* FIXME: FS introduced () around &&, precedence! MR1 */
 167       {
 168           linenr++;
 169           firstline=NULL;
 170           if (line[0]=='>') //line contains sequence name
 171               {
 172                   if (k>=MAXSEQ-1)
 173                       {
 174                           if (v>=1 && k>=MAXSEQ)
 175                               cerr<<endl<<"WARNING: maximum number "<<MAXSEQ<<" of sequences exceded in file "<<infile<<"\n";
 176                           break;
 177                       }
 178                   cur_name=line+1; //beginning of current sequence name
 179                   if (k>=0) //if this is at least the second name line
 180                       {
 181                           if (strlen(cur_seq)==0)
 182                               {
 183                                   cerr<<endl<<"Error: sequence "<<sname[k]<<" contains no residues."<<endl;
 184                                   exit(1);
 185                               }
 186
 187                           // Create space for residues and paste new sequence in
 188                           seq[k]=new char[strlen(cur_seq)+2];
 189                           if (!seq[k]) MemoryError("array for input sequences");
 190                           X[k]=new char[strlen(cur_seq)+2];
 191                           if (!X[k]) MemoryError("array for input sequences");
 192                           I[k]=new short unsigned int[strlen(cur_seq)+2];
 193                           if (!I[k]) MemoryError("array for input sequences");
 194                           strcpy(seq[k],cur_seq);
 195                       }
 196                   skip_sequence=0;
 197
 198                   k++;
 199                   l=1; //position in current sequence (first=1)
 200
 201                   // display[k]= 0: do not show in Q-T alignments 1: show if not filtered out later 2: show in any case (do not filter out)
 202                   // keep[k] = 0: do not include in profile 1: include if not filtered out later 2: include in any case (do not filter out)
 203                   /* {KEEP_NOT=0, KEEP_CONDITIONALLY=1, KEEP_ALWAYS=2} */
 204                   if (line[1]=='@') cur_name++; //skip @-character in name
 205                   if (!strncmp(line,">ss_dssp",8)) {
 206                       if (kss_dssp<0) {display[k]=2; n_display++; keep[k]=KEEP_NOT; kss_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;}
 207                   }
 208                   else if (!strncmp(line,">sa_dssp",8)) {
 209                       if (ksa_dssp<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; ksa_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;}
 210                   }
 211                   else if (!strncmp(line,">ss_pred",8)) {
 212                       if (kss_pred<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_pred=k; N_ss++;} else {skip_sequence=1; k--; continue;}
 213                   }
 214                   else if (!strncmp(line,">ss_conf",8)) {
 215                       if (kss_conf<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_conf=k; N_ss++;} else {skip_sequence=1; k--; continue;}
 216                   }
 217                   else if (!strncmp(line,">ss_",4) || !strncmp(line,">sa_",4)) {
 218                       display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; N_ss++;
 219                   }
 220                   else if (!strncmp(line,">aa_",4)) { // ignore sequences beginning with ">aa_"
 221                       skip_sequence=1; k--; continue;
 222                   }
 223                   //store first real seq
 224                   else if (kfirst<0)
 225                       {
 226                           char word[NAMELEN];
 227                           strwrd(word,line); // Copies first word in ptr to str
 228                           if (strstr(word,"_consensus"))
 229                               {display[k]=2; keep[k]=0; n_display++; kfirst=k;} /* MR1 */
 230                           else
 231                               {display[k]=keep[k]=KEEP_ALWAYS; n_display++; kfirst=k;}
 232                       }
 233                   //store all sequences
 234                   else if (par.mark==0) {display[k]=keep[k]=KEEP_CONDITIONALLY; n_display++;}
 235                   //store sequences up to nseqdis
 236                   else if (line[1]=='@'&& n_display-N_ss<par.nseqdis) {display[k]=keep[k]=KEEP_ALWAYS; n_display++;}
 237                   else {display[k]=KEEP_NOT; keep[k]=KEEP_CONDITIONALLY;}
 238
 239                   // store sequence name
 240                   if (v>=4) printf("Reading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",cur_name,k,n_display,display[k],keep[k]);
 241                   sname[k] = new char[strlen(cur_name)+1];
 242                   if (!sname[k]) {MemoryError("array for sequence names");}
 243                   strcpy(sname[k],cur_name);
 244               } // end if(line contains sequence name)
 245
 246           else if (line[0]=='#') // Commentary line?
 247               {
 248                   // #PF01367.9 5_3_exonuc: 5'-3' exonuclease, C-terminal SAM fold; PDB 1taq, 1bgx (T:271-174), 1taq (271-174)
 249                   if (name[0]) continue; // if already name defined: skip commentary line
 250                   char *ptr1, *ptr2;
 251                   ptr1=strscn_(line+1); // set ptr1 to first non-whitespace character after '#' -> AC number
 252                   strncpy(longname,ptr1,DESCLEN-1); // copy whole commentary line after '# ' into longname
 253                   longname[DESCLEN-1]='\0';
 254                   strtr(longname,""," ");
 255                   ptr2=strcut_(ptr1); // cut after AC number and set ptr2 to first non-whitespace character after AC number
 256                   // strcpy(fam,ptr1); // copy AC number to fam
 257                   // if (!strncmp(fam,"PF",2)) strcut_(fam,'.'); // if PFAM identifier contains '.' cut it off
 258                   // strcut_(ptr2); // cut after first word ...
 259                   strcpy(name,ptr1); // ... and copy first word into name
 260                   readCommentLine = '1'; /* MR1 */
 261               }
 262
 263           //line contains sequence residues or SS information and does not belong to a >aa_ sequence
 264           else if (!skip_sequence)
 265               {
 266                   if (v>=4) cout<<line<<"\n"; //DEBUG
 267                   if (k==-1 && v)
 268                       {
 269                           cerr<<endl<<"WARNING: No sequence name preceding following line in "<<infile<<":\n\'"<<line<<"\'\n";
 270                           continue;
 271                       }
 272
 273                   h=0; //counts characters in current line
 274
 275                   // Check whether all characters are correct; store into cur_seq
 276                   if (keep[k] || (k == kfirst) ) // normal line containing residues /* MR1 */
 277                       {
 278                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 279                               {
 280                                   if (aa2i(line[h])>=0) // ignore white-space characters ' ', \t and \n (aa2i()==-1)
 281                                       {cur_seq[l]=line[h]; l++;}
 282                                   else if (aa2i(line[h])==-2 && v)
 283                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
 284                                   h++;
 285                               }
 286                       }
 287                   else if (k==kss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
 288                       {
 289                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 290                               {
 291                                   if (ss2i(line[h])>=0 && ss2i(line[h])<=7)
 292                                       {cur_seq[l]=ss2ss(line[h]); l++;}
 293                                   else if (v)
 294                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
 295                                   h++;
 296                               }
 297                       }
 298                   else if (k==ksa_dssp) // lines with dssp solvent accessibility states (. - ???)
 299                       {
 300                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 301                               {
 302                                   if (sa2i(line[h])>=0)
 303                                       cur_seq[l++]=line[h];
 304                                   else if (v)
 305                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
 306                                   h++;
 307                               }
 308                       }
 309                   else if (k==kss_pred) // lines with predicted secondary structure (. - H E C)
 310                       {
 311                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 312                               {
 313                                   if (ss2i(line[h])>=0 && ss2i(line[h])<=3)
 314                                       {cur_seq[l]=ss2ss(line[h]); l++;}
 315                                   else if (v)
 316                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
 317                                   h++;
 318                               }
 319                       }
 320                   else if (k==kss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
 321                       {
 322                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 323                               {
 324                                   if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9'))
 325                                       {cur_seq[l]=line[h]; l++;}
 326                                   else if (v)
 327                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n";
 328                                   h++;
 329                               }
 330                       }
 331                   else if (display[k]) // other lines such as >sa_pred etc
 332                       {
 333                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
 334                               {
 335                                   if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9') || (line[h]>='A' && line[h]<='B'))
 336                                       {cur_seq[l]=line[h]; l++;}
 337                                   else if (v)
 338                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n";
 339                                   h++;
 340                               }
 341                       }
 342                   if (v && l>=/*MAXCOL*/par.maxColCnt-1)
 343                       {
 344                           cerr<<endl<<"WARNING: maximum number of residues "<</*MAXCOL*/par.maxColCnt-2<<" exceded in sequence "<<sname[k]<<"\n";
 345                           skip_sequence=1;
 346                       }
 347                   cur_seq[l]='\0'; //Ensure that cur_seq ends with a '\0' character
 348               } //end else
 349
 350       }
 351   /////////////////////////////////////////////////////////////////////////
 352
 353
 354   if (k>=0) //if at least one sequence was read in
 355       {
 356           seq[k]=new char[strlen(cur_seq)+2];
 357           if (!seq[k]) MemoryError("array for input sequences");
 358           X[k]=new char[strlen(cur_seq)+2];
 359           if (!X[k]) MemoryError("array for input sequences");
 360           I[k]=new short unsigned int[strlen(cur_seq)+2];
 361           if (!I[k]) MemoryError("array for input sequences");
 362           strcpy(seq[k],cur_seq);
 363       }
 364   else
 365       {cerr<<endl<<"Error: no sequences found in file "<<infile<<"\n"; exit(1);}
 366
 367   N_in = k+1;
 368
 369   // Set name, longname, fam
 370   if (!*name) // longname, name and family were not set by '#...' line yet -> extract from first sequence
 371       {
 372           char* ptr;
 373           // strtr(sname[kfirst],"~"," "); // 'transpose': replaces the tilde with a blanc everywhere in sname[kfirst]
 374           strncpy(longname,sname[kfirst],DESCLEN-1); // longname is name of first sequence
 375           longname[DESCLEN-1]='\0';
 376           strncpy(name,sname[kfirst],NAMELEN-1); // Shortname is first word of longname...
 377           name[NAMELEN-1]='\0';
 378           ptr = strcut(name); // ...until first white-space character
 379           if (ptr && islower(ptr[0]) && ptr[1]=='.' && isdigit(ptr[2])) //Scop family code present as second word?
 380               {
 381                   lwrstr(name); // Transform upper case to lower case
 382                   strcut(ptr); // Non-white-space characters until next white-space character..
 383                   strcpy(fam,ptr); // ...are the SCOP familiy code
 384               }
 385           else if (name[0]=='P' && name[1]=='F' && isdigit(name[2]) && isdigit(name[3]) ) //Pfam code
 386               {
 387                   strcpy(fam,name); // set family name = Pfam code
 388               }
 389       }
 390
 391
 392
 393   delete[] cur_seq; cur_seq = NULL;
 394
 395   // Checking for warning messages
 396   if (v==0) return;
 397   if (v>=2) cout<<"Read "<<infile<<" with "<<N_in<<" sequences\n";
 398   if (v>=3) cout<<"Query sequence for alignment has number "<<kfirst<<" (0 is first)\n";
 399   return;
 400 }
 401
 402 /*
 403  * At this point GetSeqsFromHMM() slots in, however,
 404  * only needed in hhbliys.C, so will skip it for moment, MR1
 405  */
 406
 407
 408 /////////////////////////////////////////////////////////////////////////////
 409 /**
 410  * @brief  Convert ASCII in seq[k][l] to int (0-20) in X[k][i],
 411  *  throw out all insert states, record their number in I[k][i]
 412  *  and store sequences to be displayed in seq[k] */
 413 /////////////////////////////////////////////////////////////////////////////
 414 void
 415 Alignment::Compress(const char infile[])
 416 {
 417     int i; // Index for match state (first=1)
 418     int l; // Postion in alignment incl. gaps (first=1)
 419     int k; // Index for sequences (first=0)
 420     int a; // amino acid index
 421     char c;
 422     int unequal_lengths=0; /* k: seq k doesn't have same number
 423                               of match states as seq 0 => WARNING */
 424     /* points to next character in seq[k] to be written */
 425     /*static short unsigned int h[MAXSEQ];*/
 426     /*short*/ unsigned int *h = NULL; /* short may lead to overflow for long alignments, FS, r235 -> r236 */
 427
 428     h = new /*short*/ unsigned int[N_in+2]; /* short -> overflow, FS, r235 -> r236 */
 429     float *percent_gaps = NULL; /* FS, 2010-Nov */
 430     char *match_state = NULL;  /* FS, 2010-Nov */
 431
 432
 433     // Initialize
 434     for (k=0;k<N_in; k++)
 435         {I[k][0]=0;}
 436
 437     if (v>=3)
 438         {
 439             if (par.M==1)
 440                 cout<<"Using match state assignment by capital letters (a2m format)\n";
 441             else if (par.M==2) cout<<"Using percentage-rule match state assignment\n";
 442             else if (par.M==3) cout<<"Using residues of first sequence as match states\n";
 443         }
 444
 445     // Create matrices X and I with amino acids represented by integer numbers
 446     switch(par.M)
 447         {
 448
 449             /////////////////////////////////////////////////////////////////////////
 450             /* a2m/a3m format: match states capital case,
 451                inserts lower case, delete states '-', inserted gaps '.'
 452                The confidence values for ss prediction are interpreted as follows:
 453                0-9:match states(!) '-' :match state '.':insert */
 454         case 1:
 455         default:
 456
 457             // Warn if alignment is ment to be -M first or -M NN instead of A2M/A3M
 458             if (v>=2 && strchr(seq[kfirst],'-') ) // Seed/query sequence contains a gap ...
 459                 {
 460                     for (k=1; k<N_in; k++)
 461                         if (strpbrk(seq[k],"abcdefghiklmnpqrstuvwxyz.")) break;
 462                     if (k==N_in) // ... but alignment contains no lower case residue
 463                         printf("WARNING: input alignment %s looks like aligned FASTA instead of A2M/A3M format. Consider using '-M first' or '-M 50'\n",infile);
 464                 }
 465
 466             // Remove '.' characters from seq[k]
 467             for(k=0; k<N_in; k++)
 468                 {
 469                     char* ptrS=seq[k]; // pointer to source: character in seq[k]
 470                     char* ptrD=seq[k]; // pointer to destination: seq[k]
 471                     while(1) // omit '.' symbols
 472                         {
 473                             if (*ptrS!='.') {*ptrD=*ptrS; ptrD++;} //leave out '.' symbols
 474                             if (!*ptrS) break;
 475                             ptrS++;
 476                         }
 477                 }
 478             L=/*MAXRES*/par.maxResLen-2; // needed because L=imin(L,i)
 479             for (k=0; k<N_in; k++)
 480                 {
 481                     i=1; l=1; // start at i=1, not i=0!
 482                     if (keep[k]) //skip >ss_dssp, >ss_pred, >ss_conf, >aa_... sequences
 483                         {
 484                             while((c=seq[k][l++])) // assign residue to c at same time
 485                                 {
 486                                     if (c>='a' && c<='z') I[k][i-1]++;//insert state = lower case character
 487                                     else if (c!='.') //match state = upper case character
 488                                         {
 489                                             X[k][i]=aa2i(c);
 490                                             I[k][i]=0;
 491                                             i++;
 492                                         }
 493                                 }
 494                         }
 495                     else if (k==kss_dssp || k==kss_pred) // does alignment contain sequence of secondary structure states?
 496                         {
 497                             while((c=seq[k][l++])) // assign residue to c at same time
 498                                 if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=ss2i(c); //match state = upper case character
 499                         }
 500                     else if (k==ksa_dssp) // does alignment contain sequence of prediction confidence values?
 501                         {
 502                             while((c=seq[k][l++])) // assign residue to c at same time
 503                                 if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=sa2i(c); //match state = upper case character
 504                         }
 505                     else if (k==kss_conf) // does alignment contain sequence of prediction confidence values?
 506                         {
 507                             while((c=seq[k][l++])) // assign residue to c at same time
 508                                 if (c!='.') X[k][i++]=cf2i(c); //match state = 0-9 or '-'
 509                         }
 510                     else if (k==kfirst)        // does alignment contain sequence of prediction confidence values?
 511                         {
 512                             while((c=seq[k][l++]))  // assign residue to c at same time
 513                                 if (c!='.')
 514                                     {
 515                                         X[k][i]=aa2i(c);
 516                                         I[k][i]=0;
 517                                         ++i;
 518                                     }
 519                         }
 520                     else continue;
 521                     i--;
 522                     if (L!=i && L!=/*MAXRES*/par.maxResLen-2 && !unequal_lengths) unequal_lengths=k; //sequences have different lengths
 523                     L=imin(L,i);
 524                 } // end for (k)
 525             if (unequal_lengths) break;
 526
 527             //Replace GAP with ENDGAP for all end gaps /* MR1 */
 528             for (k=0; k<N_in; ++k)
 529                 {
 530                     if (!keep[k]) continue;
 531                     for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */
 532                     for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
 533                 }
 534
 535             for (i=1; i<=L; i++) this->l[i]=i; //assign column indices to match states
 536             if (L<=0)
 537                 {
 538                     cout<<"\nError: Alignment in "<<infile<<" contains no match states. Consider using -M first or -M <int> option"<<endl;
 539                     exit(1);
 540                 }
 541
 542             if (L==/*MAXRES*/par.maxResLen-2 && v>=2)
 543                 {
 544                     printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",L);
 545                     break;
 546                 }
 547             if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" match states\n";
 548             break;
 549
 550             /////////////////////////////////////////////////////////////////////////
 551             // gap-rule assignment of match states
 552         case 2:
 553             int nl[NAA+2]; //nl[a] = number of seq's with amino acid a at position l
 554             /* Note: allocating statically is fine most of the time
 555                but when the sequences/profiles get really long
 556                we might run out of memory, so must really do it dynamically.
 557                had to move declaration of float *percent_gaps out of switch()
 558             */
 559             //float percent_gaps[MAXCOL]; //percentage of gaps in column k (with weighted sequences)
 560             percent_gaps = new float[par.maxColCnt];
 561
 562             //determine number of columns L in alignment
 563             L=strlen(seq[kfirst])-1;
 564
 565             // Conversion to integer representation, checking for unequal lengths and initialization
 566             if (nres==NULL) nres=new int[N_in];
 567 #if 0
 568 //#ifdef HAVE_OPENMP
 569 #pragma omp parallel for schedule(static), private(l)
 570 #endif
 571             for (k=0; k<N_in; k++)
 572                 {
 573                     if (!keep[k]) continue;
 574                     int nr=0;
 575                     wg[k]=0; nres[k]=0;
 576                     for (l=1; l<=L; l++)
 577                         {
 578                             X[k][l]=aa2i(seq[k][l]);
 579                             if (X[k][l]<NAA) nr++;
 580                         }
 581                     nres[k]=nr;
 582                     if (seq[k][L+1]!='\0' && !unequal_lengths) unequal_lengths=k;
 583                 }
 584             if (unequal_lengths) break;
 585
 586             // Quick and dirty calculation of the weight per sequence wg[k]
 587             for (l=1; l<=L; l++) // for all positions l in alignment
 588                 {
 589                     int naa=0; //number of different amino acids
 590                     for (a=0; a<20; a++) nl[a]=0;
 591                     for (k=0; k<N_in; k++) if (keep[k]) nl[ (int)X[k][l]]++;
 592                     for (a=0; a<20; a++) if(nl[a]) naa++;
 593                     if (!naa) naa=1; //naa=0 when column consists of only gaps and Xs (=ANY)
 594 #if 0
 595 //#ifdef HAVE_OPENMP
 596 #pragma omp parallel for schedule(static)
 597 #endif
 598                     for (k=0; k<N_in; k++)
 599                         if (keep[k] && (X[k][l]<20) )
 600                             {
 601                                 //wg[k]+=1.0/float(nl[ (int)X[k][l]]*naa*nres[k]+30.0); /* original version */
 602                                 wg[k] += 1.0/float(nl[ (int)X[k][l]]*naa*(nres[k]+30.0)); /* MR1 */
 603                                 // wg[k] += 1.0/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */
 604                                 // wg[k] += (naa-1.0)/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */
 605                             }
 606                 } /* 1=l<=L*/
 607
 608             //Replace GAP with ENDGAP for all end gaps
 609             for (k=0; k<N_in; ++k)
 610                 {
 611                     if (!keep[k]) continue;
 612                     for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */
 613                     for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
 614                 }
 615
 616             // Add up percentage of gaps
 617 #if 0
 618 //#ifdef HAVE_OPENMP
 619 #pragma omp parallel for schedule(static) private(k)
 620 #endif
 621             for (l=1; l<=L; l++)
 622                 {
 623                     float res=0;
 624                     float gap=0;
 625                     for (k=0; k< N_in; k++){
 626                         if (keep[k]){
 627                             if ( X[k][l]<GAP) res+=wg[k]; /* MR1, AA or ANY, changed from <ANY */
 628                             else if ( X[k][l] != ENDGAP) gap+=wg[k];  /* MR1, else: GAP. ENDGAPs are ignored for counting percentage */
 629                         }
 630                     }
 631                     percent_gaps[l]=100.*gap/(res+gap);
 632                     if (v>=4) cout<<"percent gaps["<<l<<"]="<<percent_gaps[l]<<" first seq:"<<seq[0][l]<<"\n";
 633                 }
 634
 635             /* Insert states 'bloat' the HMM,
 636                throwing them out 'slims' down the HMM.
 637                A slimmer HMM takes less time to construct.
 638                However, the marriage of Clustal and Hhalign
 639                is particularly sensitive to residues
 640                at the very end of the profile; these I call
 641                'telomeres'. Telomeres must not be shed when
 642                throwing out insert states, for the telomeres
 643                we set the match threshold to 100%.
 644              */
 645 #define MGAP_LOGIC 0
 646 #define TELOMERE_LOGIC 1
 647 #define TELOMERE_DYNAMIC 0
 648
 649 #define ALWAYS_ACCEPT 101.0 /* do NOT change this parameter, must be >=100,
 650                                make slightly bigger than 100% -- to be sure to be sure */
 651 #define DEFAULT_MGAPS 100.0 /* Soeding's default is 50, omega default prior to telomere logic was 100
 652                                FIXME: this used to be par.Mgaps,
 653                                in a later version re-introduce par.Mgaps to keep this value flexible */
 654 #define TELOMER_LENGTH 10   /* this parameter must be > 0 (unless DEFAULT_MGAPS=100),
 655                                if it is too big (L/2) then telomere logic has no effect,
 656                                don't think it should be changed (much) */
 657 #define TELOMER_FRACTION 0.10
 658             //#define HMM_MIN_LENGTH 0.923
 659 #define HMM_MIN_LENGTH 0.950
 660 #define FORTRAN_OFFSET 1
 661             double dDefaultMgaps;
 662             dDefaultMgaps = DEFAULT_MGAPS;
 663
 664 #if TELOMERE_LOGIC /* turn telomere logic on (1) or off (0) */
 665             int iTelomereLength;
 666
 667 #if TELOMERE_DYNAMIC /* keep telomere length 'dynamic' */
 668             iTelomereLength = TELOMER_LENGTH > (int)(L*TELOMER_FRACTION) ? TELOMER_LENGTH : (int)(L*TELOMER_FRACTION);
 669 #else
 670             iTelomereLength = TELOMER_LENGTH;
 671 #endif /* this was dynamic telomere */
 672 #endif /* this was telomere logic */
 673
 674             /* if HMMs get too small (much smaller than profile length L)
 675                then one is liable to get a back-tracking error.
 676                So we should ensure that the DEFAULT_MGAPS parameter does NOT
 677                shrink the HMM too much.
 678                take percentage-gap vector, sort it, and fix dDefaultMgaps,
 679                such that at least (HMM_MIN_LENGTH)*(L) are left
 680              */
 681 #if MGAP_LOGIC /* try to adapt Mgaps to size of final HMM */
 682             {
 683                 float *pfPercentGaps = NULL;
 684                 if (NULL == (pfPercentGaps = (float *)malloc((L+1)*sizeof(float)))){
 685                     printf("%s:%s:%d: could not malloc %d float for sorted percent-gaps\n",
 686                            __FUNCTION__, __FILE__, __LINE__, L+1);
 687                     dDefaultMgaps = DEFAULT_MGAPS;
 688                 }
 689                 else {
 690                     for (l = 0; l < L; l++) {
 691                         pfPercentGaps[l] = percent_gaps[l+FORTRAN_OFFSET];
 692                     }
 693                     qsort(pfPercentGaps, L, sizeof(float), CompFltAsc);
 694
 695                     dDefaultMgaps = pfPercentGaps[(int)(HMM_MIN_LENGTH*L)];
 696                     if (dDefaultMgaps < DEFAULT_MGAPS){
 697                         //printf("Mgaps = %f <- %f\n", DEFAULT_MGAPS, dDefaultMgaps);
 698                         dDefaultMgaps = DEFAULT_MGAPS;
 699                     }
 700                     else {
 701                         //printf("Mgaps = %f\n", dDefaultMgaps);
 702                     }
 703
 704                     free(pfPercentGaps); pfPercentGaps = NULL;
 705                 }
 706             }
 707 #endif /* tried to adapt Mgaps to size of final HMM */
 708
 709             // Throw out insert states and keep only match states
 710             i=0;
 711             for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';}
 712             for (l=1; l<=L; l++)
 713                 {
 714 #if TELOMERE_LOGIC
 715                     float fMgaps = ALWAYS_ACCEPT;
 716                     if ( (l < iTelomereLength) || (L-l < iTelomereLength) ){
 717                         /* residue is in telomere, always retain this position */
 718                         fMgaps = ALWAYS_ACCEPT;
 719                     }
 720                     else if (0){
 721                         /* FIXME: would like to put a transition phase in here,
 722                            where the Mgap value gradually goes down from 100 to DEFAULT_MGAPS,
 723                            however, may not be necessary and will make code more clunky */
 724                     }
 725                     else {
 726                         /* position is in centre of sequence,
 727                            retain position if less than DEFAULT_MGAPS% gaps at this position,
 728                            for example, if DEFAULT_MGAPS=30 throw out if more than 30% gap.
 729                            conversely, if DEFAULT_MGAPS=100 throw out if more than 100% gaps,
 730                            which can never happen, so always retain */
 731                         fMgaps = dDefaultMgaps;
 732                     }
 733                     if (percent_gaps[l] <= fMgaps)
 734 #else /* this was telomere logic */
 735                     if (percent_gaps[l]<=float(par.Mgaps))
 736 #endif /* this was Soeding default */
 737                         {
 738                             if (i>=/*MAXRES*/par.maxResLen-2) {
 739                                 if (v>=1)
 740                                     printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i);
 741                                 break;
 742                             }
 743                             i++;
 744                             this->l[i]=l;
 745 #if 0
 746 //#ifdef HAVE_OPENMP
 747 #pragma omp parallel for schedule(static)
 748 #endif
 749                             for (k=0; k<N_in; k++)
 750                                 {
 751                                     if (keep[k])
 752                                         {
 753                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
 754                                             X[k][i]=X[k][l];
 755                                             I[k][i]=0;
 756                                         }
 757                                     else if (k==kss_dssp || k==kss_pred)
 758                                         {
 759                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
 760                                             X[k][i]=ss2i(seq[k][l]);
 761                                         }
 762                                     else if (k==ksa_dssp)
 763                                         {
 764                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
 765                                             X[k][i]=sa2i(seq[k][l]);
 766                                         }
 767                                     else if (k==kss_conf)
 768                                         {
 769                                             seq[k][h[k]++]=seq[k][l];
 770                                             X[k][i]=cf2i(seq[k][l]);
 771                                         }
 772                                 }
 773                         }
 774                     else
 775                         {
 776                             for (k=0; k<N_in; k++)
 777                                 if (keep[k] && X[k][l]<GAP)
 778                                     {
 779                                         I[k][i]++;
 780                                         seq[k][h[k]++]=InsertChr(seq[k][l]);
 781                                     }
 782                         }
 783                 }
 784             for (k=0; k<N_in; k++) seq[k][h[k]]='\0';
 785
 786             //printf("%d\t%d\t%d\tN/L/M\n", N_in, L, i); /* -------- FIXME  */
 787
 788             if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n";
 789             L = i; //Number of match states
 790
 791             delete[] percent_gaps; percent_gaps = NULL;
 792             break;
 793
 794
 795             ////////////////////////////////////////////////////////////////////////
 796             // Using residues of first sequence as match states
 797         case 3:
 798             /* Note: allocating statically is fine most of the time
 799                but when the sequences/profiles get really long
 800                we might run out of memory, so must really do it dynamically.
 801                had to move declaration of float *percent_gaps out of switch()
 802             */
 803             //char match_state[MAXCOL]; //1: column assigned to match state 0: insert state
 804             match_state = new char[par.maxColCnt];
 805
 806             // Determine number of columns L in alignment
 807             L=strlen(seq[0]+1);
 808             if (v>=3) printf("Length of first seq = %i\n",L);
 809             // Check for sequences with unequal lengths
 810             for (k=1; k<N_in; k++)
 811                 if (int(strlen(seq[k]+1))!=L) {unequal_lengths=k; break;}
 812             if (unequal_lengths) break;
 813
 814             // Determine match states: seq kfirst has residue at pos l -> match state
 815             for (l=1; l<=L; l++)
 816                 if (isalpha(seq[kfirst][l])) match_state[l]=1; else match_state[l]=0;
 817             // Throw out insert states and keep only match states
 818             for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';}
 819             i=0;
 820             for (l=1; l<=L; l++)
 821                 {
 822                     if (match_state[l]) // does sequence 0 have residue at position l?
 823                         {
 824                             if (i>=/*MAXRES*/par.maxResLen-2) {
 825                                 if (v>=1)
 826                                     printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i);
 827                                 break;
 828                             }
 829                             i++;
 830                             this->l[i]=l;
 831                             for (k=0; k<N_in; k++)
 832                                 {
 833                                     if (keep[k])
 834                                         {
 835                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
 836                                             X[k][i]=aa2i(seq[k][l]);
 837                                             I[k][i]=0;
 838                                         }
 839                                     else if (k==kss_dssp || k==kss_pred)
 840                                         {
 841                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
 842                                             X[k][i]=ss2i(seq[k][l]);
 843                                         }
 844                                     else if (k==ksa_dssp)
 845                                         {
 846                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
 847                                             X[k][i]=sa2i(seq[k][l]);
 848                                         }
 849                                     else if (k==kss_conf)
 850                                         {
 851                                             seq[k][h[k]++]=seq[k][l];
 852                                             X[k][i]=cf2i(seq[k][l]);
 853                                         }
 854                                 }
 855                         }
 856                     else
 857                         {
 858                             for (k=0; k<N_in; k++)
 859                                 if (keep[k] && aa2i(seq[k][l])<GAP)
 860                                     {
 861                                         I[k][i]++;
 862                                         seq[k][h[k]++]=InsertChr(seq[k][l]);
 863                                     }
 864                         }
 865                 }
 866             for (k=0; k<N_in; k++) seq[k][h[k]]='\0';
 867
 868             //Replace GAP with ENDGAP for all end gaps /* MR1 */
 869             for (k=0; k<N_in; ++k)
 870                 {
 871                     if (!keep[k]) continue;
 872                     for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1, note i++ <- ++i */
 873                     for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
 874                 }
 875
 876             if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n";
 877             L = i; //Number of match states
 878
 879             delete[] match_state; match_state = NULL;
 880             break;
 881
 882         } //end switch()
 883     ///////////////////////////////////////////////////////////////////////////
 884
 885
 886     // Error
 887     if (unequal_lengths)
 888         {
 889             strcut(sname[unequal_lengths]);
 890             cerr<<endl<<"Error: sequences in "<<infile<<" do not all have the same number of columns, \ne.g. first sequence and sequence "<<sname[unequal_lengths]<<".\n";
 891             if(par.M==1) cerr<<".\nCheck input format for '-M a2m' option and consider using '-M first' or '-M 50'\n";
 892             exit(1);
 893         }
 894
 895     // Avert user about -cons option?
 896     if (v>=2 && !par.cons)
 897         {
 898             for (i=1; i<=L; i++)
 899                 if (X[kfirst][i]==GAP)
 900                     {
 901                         printf("NOTE: Use the '-cons' option to calculate a consensus sequence as first sequence of the alignment.\n");
 902                         break;
 903                     }
 904         }
 905     /* MR1
 906     //Replace GAP with ENDGAP for all end gaps
 907     for (k=0; k<N_in; k++)
 908     {
 909     if (!keep[k]) continue;
 910     for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP;
 911     for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP;
 912     }*/
 913
 914     // DEBUG
 915     if (v>=4)
 916         for (k=0; k<N_in; k++)
 917             {
 918                 if (!display[k]) continue;
 919                 cout<<">"<<sname[k]<<"\n";
 920                 if (k==kss_dssp || k==kss_pred) {for (i=1; i<=L; i++) cout<<char(i2ss(X[k][i]));}
 921                 else if (k==kss_conf) {for (i=1; i<=L; i++) cout<<char(i2cf(X[k][i]));}
 922                 else if (k==ksa_dssp) {for (i=1; i<=L; i++) cout<<char(i2sa(X[k][i]));}
 923                 else
 924                     {
 925                         for (i=1; i<=L; i++) cout<<char(i2aa(X[k][i]));
 926                         cout<<"\n";
 927                         for (i=1; i<=L; i++)
 928                             if (I[k][i]==0) cout<<"-"; else if (I[k][i]>9) cout<<"X"; else cout<<I[k][i];
 929                     }
 930                 cout<<"\n";
 931             }
 932
 933     delete[](h); h = NULL;
 934 }
 935
 936
 937 /**
 938  * @brief Remove sequences with seq. identity larger than seqid percent
 939  *(remove the shorter of two) or coverage<cov_thr
 940  *
 941  * FIXME: originally max_seqid is a variable that is the cutoff
 942  *  above which sequences are thrown out. We want to throw out sequences
 943  *  when building the HMM but not for display, there we want to keep all.
 944  *  This should be really easy, but there is some hidden stuff going on
 945  *  in this function, so I did a minimal-invasive change and just stuck
 946  *  (effectively) a hard-wired 100 instead of the variable.
 947  *  At a later stage we should get rid of this function alltogether
 948  *  as it does gobble up some time (and is quadratic in noof sequences, I think)
 949  *  FS, 2010-10-04
 950  */
 951 ////////////////////////////////////////////////////////////////////////////
 952 /*
 953  */
 954 inline int
 955 Alignment::FilterForDisplay(int max_seqid, int coverage, int qid, float qsc, int N)
 956 {
 957
 958     /* FIXME
 959      * by just returning n_display and not doing anything
 960      * I think we display everything and not do any work for it
 961      */
 962     return n_display; /* FS, 2010-10-04*/
 963
 964
 965     if (par.mark) return n_display;
 966     char *dummy = new char[N_in+1];
 967     int vtmp=v, seqid;
 968     v=0;
 969     n_display=0;
 970     if (kss_dssp>=0) display[kss_dssp]=KEEP_NOT;
 971     if (ksa_dssp>=0) display[ksa_dssp]=KEEP_NOT;
 972     if (kss_pred>=0) display[kss_pred]=KEEP_NOT;
 973     if (kss_conf>=0) display[kss_conf]=KEEP_NOT;
 974     for (seqid=imin(10,max_seqid); n_display<N && seqid<=max_seqid; seqid++)
 975         {
 976             for (int k=0; k<N_in; k++) dummy[k]=display[k];
 977             n_display = Filter2(dummy,coverage,qid,qsc,20,seqid,0);
 978             // printf("Seqid=%3i n_display=%4i\n",seqid,n_display);
 979         }
 980     if (n_display>N)
 981         {
 982             for (int k=0; k<N_in; k++) dummy[k]=display[k];
 983             n_display = Filter2(dummy,coverage,qid,qsc,20,--(--seqid),0);
 984         }
 985     v=vtmp;
 986     for (int k=0; k<N_in; k++) display[k]=dummy[k];
 987     if (kss_dssp>=0) {display[kss_dssp]=KEEP_CONDITIONALLY; n_display++;}
 988     if (ksa_dssp>=0) {display[ksa_dssp]=KEEP_CONDITIONALLY; n_display++;}
 989     if (kss_pred>=0) {display[kss_pred]=KEEP_CONDITIONALLY; n_display++;}
 990     if (kss_conf>=0) {display[kss_conf]=KEEP_CONDITIONALLY; n_display++;}
 991     delete[] dummy; dummy = NULL;
 992     return n_display;
 993 }
 994
 995 /////////////////////////////////////////////////////////////////////////////////////
 996 // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) or coverage<cov_thr
 997 /////////////////////////////////////////////////////////////////////////////////////
 998 inline int Alignment::Filter(int max_seqid, int coverage, int qid, float qsc, int N)
 999 {
1000     return Filter2(keep,coverage,qid,qsc,20,max_seqid,N);
1001 }
1002
1003 /////////////////////////////////////////////////////////////////////////////
1004 /*
1005  * @brief Select set of representative sequences in the multiple sequence alignment
1006  *
1007  * Filter criteria:
1008  * Remove sequences with coverage of query less than "coverage" percent
1009  * Remove sequences with sequence identity to query of less than "qid" percent
1010  * If Ndiff==0, remove sequences with seq. identity larger than seqid2(=max_seqid) percent
1011  * If Ndiff>0, remove sequences with minimum-sequence-identity filter of between seqid1
1012  * and seqid2 (%), where the minimum seqid threshold is determined such that,
1013  * in all column blocks of at least WMIN=25 residues, at least Ndiff sequences are left.
1014  * This ensures that in multi-domain proteins sequences covering one domain are not
1015  * removed completely because sequences covering other domains are more diverse.
1016  *
1017  * Allways the shorter of two compared sequences is removed (=> sort sequences by length first).
1018  * Please note: sequence identity of sequence x with y when filtering x is calculated as
1019  * number of residues in sequence x that are identical to an aligned residue in y / number of residues in x
1020  * Example: two sequences x and y are 100% identical in their overlapping region but one overlaps by 10% of its
1021  * length on the left and the other by 20% on the right. Then x has 10% seq.id with y and y has 20% seq.id. with x.
1022  */
1023 //////////////////////////////////////////////////////////////////////////////
1024 int
1025 Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, int seqid2, int Ndiff)
1026 {
1027     // In the beginnning, keep[k] is 1 for all regular amino acid sequences and 0 for all others (ss_conf, ss_pred,...)
1028     // In the end, keep[k] will be 1 for all regular representative sequences kept in the alignment, 0 for all others
1029     char* in=new char[N_in+1]; // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid
1030     char* inkk=new char[N_in+1]; // inkk[k]=1 iff in[ksort[k]]=1 else 0;
1031     int* Nmax=new int[L+2]; // position-dependent maximum-sequence-identity threshold for filtering /* MR1, used to be called idmax*/
1032     int* idmaxwin=new int[L+2]; // minimum value of idmax[i-WFIL,i+WFIL]
1033     int* seqid_prev=new int[N_in+1]; // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid)
1034     int* N=new int[L+2]; // N[i] number of already accepted sequences at position i
1035     const int WFIL=25; // see previous line
1036
1037     int diffNmax=Ndiff;       // current  maximum difference of Nmax[i] and Ndiff /* MR1 */
1038     int diffNmax_prev=0;      // previous maximum difference of Nmax[i] and Ndiff /* MR1 */
1039
1040     int seqid; // current maximum value for the position-dependent maximum-sequence-identity thresholds in idmax[]
1041     int seqid_step=0;         // previous increment of seqid /* MR1 */
1042
1043     float diff_min_frac; // minimum fraction of differing positions between sequence j and k needed to accept sequence k
1044     float qdiff_max_frac=0.9999-0.01*qid; // maximum allowable number of residues different from query sequence
1045     int diff; // number of differing positions between sequences j and k (counted so far)
1046     int diff_suff; // number of differing positions between sequences j and k that would be sufficient
1047     int qdiff_max; // maximum number of residues required to be different from query
1048     int cov_kj; // upper limit of number of positions where both sequence k and j have a residue
1049     int first_kj; // first non-gap position in sequence j AND k
1050     int last_kj; // last non-gap position in sequence j AND k
1051     int kk, jj; // indices for sequence from 1 to N_in
1052     int k, j; // kk=ksort[k], jj=ksort[j]
1053     int i; // counts residues
1054     int n; // number of sequences accepted so far
1055
1056     // Initialize in[k]
1057     for (n=k=0; k<N_in; k++) if (keep[k]==KEEP_ALWAYS) {in[k]=2/*KEEP_ALWAYS??*/; n++;} else in[k]=0;
1058
1059     // Determine first[k], last[k]?
1060     if (first==NULL)
1061         {
1062             first=new int[N_in];// first non-gap position in sequence k
1063             last =new int[N_in];// last  non-gap position in sequence k
1064             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1065                 {
1066                     for (i=1; i<=L; i++) if (X[k][i]<NAA) break;
1067                     first[k]=i;
1068                     for (i=L; i>=1; i--) if (X[k][i]<NAA) break;
1069                     last[k]=i;
1070                 }
1071         }
1072
1073     // Determine number of residues nres[k]?
1074         // KB: memory leak as sizeof(nres) just gives the size of an int pointer
1075     //if ( (nres==NULL)  || (sizeof(nres)<N_in*sizeof(int)) )
1076     if  (nres==NULL)
1077         {
1078             nres=new int[N_in];
1079             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1080                 {
1081                     int nr=0;
1082                     for (i=first[k]; i<=last[k]; i++)
1083                         if (X[k][i]<NAA) nr++;
1084                     nres[k]=nr;
1085                     // printf("%20.20s nres=%3i first=%3i last=%3i\n",sname[k],nr,first[k],last[k]);
1086                 }
1087         }
1088
1089     // Sort sequences according to length; afterwards, nres[ksort[kk]] is sorted by size
1090     if (ksort==NULL)
1091         {
1092             ksort=new int[N_in]; // never reuse alignment object for new alignment with more sequences
1093             for (k=0; k<N_in; k++) ksort[k]=k;
1094             QSortInt(nres,ksort,kfirst+1,N_in-1,-1); //Sort sequences after kfirst (query) in descending order
1095         }
1096     for (kk=0; kk<N_in; kk++) inkk[kk]=in[ksort[kk]];
1097
1098     // Initialize N[i], idmax[i], idprev[i]
1099     for (i=1; i<first[kfirst]; i++) N[i]=0;
1100     for (i=first[kfirst]; i<=last[kfirst]; i++) N[i]=1;
1101     for (i=last[kfirst]+1; i<=L; i++) N[i]=0;
1102     //for (i=1; i<=L; i++) {idmax[i]=seqid1; idmaxwin[i]=-1;}
1103     for (i=1; i<=L; ++i) {Nmax[i]=0; idmaxwin[i]=-1;} /* MR1 */
1104     for (k=0; k<N_in; k++) seqid_prev[k]=-1;
1105     if (Ndiff<=0 || Ndiff>=N_in) {seqid1=seqid2; Ndiff=N_in; diffNmax=Ndiff;}
1106
1107     // Check coverage and sim-to-query criteria for each sequence k
1108     for (k=0; k<N_in; k++)
1109         {
1110             if (keep[k]==KEEP_NOT || keep[k]==KEEP_ALWAYS) continue; // seq k not regular sequence OR is marked sequence
1111             if (100*nres[k]<coverage*L) {keep[k]=KEEP_NOT; continue;} // coverage too low? => reject once and for all
1112
1113             float qsc_sum=0.0;
1114
1115             // Check if score-per-column with query is at least qsc
1116             if (qsc>-10)
1117                 {
1118                     float qsc_min = qsc*nres[k]; // minimum total score of seq k with query
1119
1120                     int gapq=0, gapk=0; // number of consecutive gaps in query or k'th sequence at position i
1121                     for (int i=first[k]; i<=last[k]; i++)
1122                         {
1123                             if (X[k][i]<20)
1124                                 {
1125                                     gapk=0;
1126                                     if (X[kfirst][i]<20)
1127                                         {
1128                                             gapq=0;
1129                                             qsc_sum += S[(int)X[kfirst][i]][(int)X[k][i]];
1130                                         }
1131                                     else if (gapq++) qsc_sum-=PLTY_GAPEXTD; else qsc_sum-=PLTY_GAPOPEN;
1132                                 }
1133                             else if (X[kfirst][i]<20)
1134                                 {
1135                                     gapq=0;
1136                                     if (gapk++) qsc_sum-=PLTY_GAPEXTD; else qsc_sum-=PLTY_GAPOPEN;
1137                                 }
1138                         }
1139                     // printf("k=%3i qsc=%6.2f\n",k,qsc_sum);
1140                     if (qsc_sum<qsc_min) {keep[k]=KEEP_NOT; continue;} // too different from query? => reject once and for all
1141                 }
1142
1143             //Check if sequence similarity with query at least qid?
1144             if (qdiff_max_frac<0.999)
1145                 {
1146                     qdiff_max=int(qdiff_max_frac*nres[k]+0.9999);
1147                     // printf("k=%-4i nres=%-4i qdiff_max=%-4i first=%-4i last=%-4i",k,nres[k],qdiff_max,first[k],last[k]);
1148                     diff=0;
1149                     for (int i=first[k]; i<=last[k]; i++)
1150                         // enough different residues to reject based on minimum qid with query? => break
1151                         if (X[k][i]<20 && X[k][i]!=X[kfirst][i] && ++diff>=qdiff_max) break;
1152                     // printf(" diff=%4i\n",diff);
1153                     if (diff>=qdiff_max) {keep[k]=KEEP_NOT; continue;} // too different from query? => reject once and for all
1154                 }
1155             // printf(" qsc=%6.2f qid=%6.2f \n",qsc_sum/nres[k],100.0*(1.0-(float)(diff)/nres[k]));
1156         } // end for (k)
1157
1158     if (seqid1>seqid2)
1159         {
1160             for (n=k=0; k<N_in; k++) if (keep[k]>KEEP_NOT) n++;
1161             return n;
1162         }
1163
1164     // Successively increment idmax[i] at positons where N[i]<Ndiff
1165     //for (seqid=seqid1; seqid<=seqid2; seqid+=1+(seqid>=50)) /* MR1 */
1166     seqid=seqid1;
1167     while (seqid<=seqid2)
1168         {
1169             /*
1170             // Update idmax[i]
1171             for (i=1; i<=L; i++) if (N[i]<Ndiff) idmax[i]=seqid;
1172
1173             // Update idmaxwin[i] as minimum of idmax[i-WFIL,i+WFIL]. If idmaxwin[] has not changed then stop
1174             char stop=1;
1175             for (i=1; i<=L; i++)
1176             {
1177             int idmax_min=seqid2;
1178             for (j=imax(1,imin(L-2*WFIL+1,i-WFIL)); j<=imin(L,imax(2*WFIL,i+WFIL)); j++)
1179             if (idmax[j]<idmax_min) idmax_min=idmax[j];
1180             if (idmax_min>idmaxwin[i]) stop=0; // idmaxwin[i] has changed => do not stop
1181             idmaxwin[i]=idmax_min;
1182             }
1183             */
1184             char stop=1;
1185             // Update Nmax[i]
1186             diffNmax_prev = diffNmax;
1187             diffNmax = 0;
1188             for (i=1; i<=L; ++i)
1189                 {
1190                     int max=0;
1191                     for (j=imax(1,imin(L-2*WFIL+1,i-WFIL)); j<=imin(L,imax(2*WFIL,i+WFIL)); ++j)
1192                         if (N[j]>max) max=N[j];
1193                     if (Nmax[i]<max) Nmax[i]=max;
1194                     if (Nmax[i]<Ndiff)
1195                         {
1196                             stop=0;
1197                             idmaxwin[i]=seqid;
1198                             if (diffNmax<Ndiff-Nmax[i]) diffNmax=Ndiff-Nmax[i];
1199                         }
1200
1201                 }
1202
1203             //printf("seqid=%3i  diffNmax_prev= %-4i   diffNmax= %-4i   n=%-5i  N_in-N_ss=%-5i\n",seqid,diffNmax_prev,diffNmax,n,N_in-N_ss);
1204
1205             if (stop) break;
1206
1207             // // DEBUG
1208             // printf("idmax ");
1209             // for (i=1; i<=L; i++) printf("%2i ",idmax[i]);
1210             // printf("\n");
1211             // printf("idmaxwin ");
1212             // for (i=1; i<=L; i++) printf("%2i ",idmaxwin[i]);
1213             // printf("\n");
1214             // printf("N[i] ");
1215             // for (i=1; i<=L; i++) printf("%2i ",N[i]);
1216             // printf("\n");
1217
1218             // Loop over all candidate sequences kk (-> k)
1219 #if 0
1220 //#ifdef HAVE_OPENMP
1221 #pragma omp parallel for schedule(static) private(k, i, diff_min_frac, jj, j, first_kj, last_kj, cov_kj, diff_suff, diff)
1222 #endif
1223             for (kk=0; kk<N_in; kk++)
1224                 {
1225                     if (inkk[kk]) continue; // seq k already accepted
1226                     k=ksort[kk];
1227                     if (!keep[k]) continue; // seq k is not regular aa sequence or already suppressed by coverage or qid criterion
1228                     if (keep[k]==KEEP_ALWAYS) {inkk[kk]=2; continue;} // accept all marked sequences (no n++, since this has been done already)
1229
1230                     // Calculate max-seq-id threshold seqidk for sequence k (as maximum over idmaxwin[i])
1231 //                    if (seqid>=100) {in[k]=inkk[kk]=1; n++; continue;}
1232                     if (seqid>=100) {
1233                         in[k]=inkk[kk]=1;
1234 #if 0
1235 //#ifdef HAVE_OPENMP
1236 #pragma omp atomic
1237 #endif
1238                         n++;
1239                         continue;
1240                     }
1241                     float seqidk=seqid1;
1242                     for (i=first[k]; i<=last[k]; i++)
1243                         if (idmaxwin[i]>seqidk) seqidk=idmaxwin[i];
1244                     if (seqid==seqid_prev[k]) continue; // sequence has already been rejected at this seqid threshold => reject this time
1245                     seqid_prev[k]=seqid;
1246                     diff_min_frac =0.9999-0.01*seqidk; // min fraction of differing positions between sequence j and k needed to accept sequence k
1247
1248                     // Loop over already accepted sequences
1249                     for (jj=0; jj<kk; jj++)
1250                         {
1251                             if (!inkk[jj]) continue;
1252                             j=ksort[jj];
1253                             first_kj=imax(first[k],first[j]);
1254                             last_kj =imin(last[k],last[j]);
1255                             cov_kj = last_kj-first_kj+1;
1256                             diff_suff=int(diff_min_frac*imin(nres[k],cov_kj)+0.999); // nres[j]>nres[k] anyway because of sorting /* MR1 0.999 */
1257                             diff=0;
1258                             for (int i=first_kj; i<=last_kj; i++)
1259                                 {
1260                                     // enough different residues to accept? => break
1261                                     if (X[k][i]>=NAA || X[j][i]>=NAA)
1262                                         cov_kj--;
1263                                     else
1264                                         if (X[k][i]!=X[j][i] && ++diff>=diff_suff) break; // accept (k,j)
1265                                 }
1266                             // // DEBUG
1267                             // printf("%20.20s with %20.20s: diff=%i diff_min_frac*cov_kj=%f diff_suff=%i nres=%i cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj);
1268                             // printf("%s\n%s\n\n",seq[k],seq[j]);
1269
1270                             //if (float(diff)<fmin(diff_min_frac*cov_kj,diff_suff)) break; //similarity > acceptace threshold? Reject! /* MR1 */
1271                             if (diff<diff_suff && float(diff)<=diff_min_frac*cov_kj) break; //dissimilarity < acceptace threshold? Reject! /* MR1 */
1272
1273
1274                         }
1275                     if (jj>=kk) // did loop reach end? => accept k. Otherwise reject k (the shorter of the two)
1276                         {
1277                             in[k]=inkk[kk]=1;
1278 #if 0
1279 //#ifdef HAVE_OPENMP
1280 #pragma omp atomic
1281 #endif
1282                             n++;
1283                             for (i=first[k]; i<=last[k]; i++) {
1284 #if 0
1285 //#ifdef HAVE_OPENMP
1286 #pragma omp atomic
1287 #endif
1288                                 N[i]++; // update number of sequences at position i
1289                             }
1290                             // printf("%i %20.20s accepted\n",k,sname[k]);
1291                         }
1292                     // else
1293                     // {
1294                     // printf("%20.20s rejected: too similar with seq %20.20s diff=%i diff_min_frac*cov_kj=%f diff_suff=%i nres=%i cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj);
1295                     // printf("%s\n%s\n\n",seq[k],seq[j]);
1296                     // }
1297
1298                 } // End Loop over all candidate sequences kk
1299
1300             // // DEBUG
1301             // printf("\n");
1302             // printf("seqid_prev[k]= \n");
1303             // for (k=0; k<N_in; k++) printf("%2i ",seqid_prev[k]);
1304             // printf("\n");
1305
1306             // Increment seqid /* MR1 */
1307             seqid_step = imax(1,imin(5,diffNmax/(diffNmax_prev-diffNmax+1)*seqid_step/2));
1308             seqid += seqid_step;
1309
1310         } // End Loop over seqid
1311
1312     if (v>=2)
1313         {
1314             printf("%i out of %i sequences passed filter (",n,N_in-N_ss);
1315             if (par.coverage)
1316                 printf("%i%% min coverage, ",coverage);
1317             if (qid)
1318                 printf("%i%% min sequence identity to query, ",qid);
1319             if (qsc>-10)
1320                 printf("%.2f bits min score per column to query, ",qsc);
1321             if (Ndiff<N_in && Ndiff>0)
1322                 printf("up to %i%% position-dependent max pairwise sequence identity)\n",seqid);
1323             else
1324                 printf("%i%% max pairwise sequence identity)\n",seqid1);
1325         }
1326
1327     for (k=0; k<N_in; k++) keep[k]=in[k];
1328     delete[] in; in = NULL;
1329     delete[] inkk; inkk = NULL;
1330     //delete[] idmax; idmax = NULL;
1331     delete[] Nmax; /* MR1 */
1332     delete[] idmaxwin; idmaxwin = NULL;
1333     delete[] seqid_prev; seqid_prev = NULL;
1334     delete[] N; N = NULL;
1335 #if 0
1336     printf("%s:%s:%d: sequences accepted = %d/%d\n", __FUNCTION__, __FILE__, __LINE__, n, N_in-N_ss);
1337 #endif
1338     return n;
1339 }
1340
1341
1342
1343 /* MR1: the Alignment::HomologyFilter is no longer needed in hhalign-stand-alone */
1344 /////////////////////////////////////////////////////////////////////////////
1345 /**
1346  * @brief Filter for min score per column coresc with core query profile,
1347  *   defined by coverage_core and qsc_core
1348  */
1349 /////////////////////////////////////////////////////////////////////////////
1350 int
1351 Alignment::HomologyFilter(int coverage_core, float qsc_core, float coresc)
1352 {
1353     const int seqid_core=90; //maximum sequence identity in core alignment
1354     const int qid_core=0;
1355     const int Ndiff_core=0;
1356     int n;
1357     HMM qcore;
1358     char* coreseq=new char[N_in]; // coreseq[k]=1 if sequence belongs to core of alignment (i.e. it is very similar to query)
1359     for (int k=0; k<N_in; k++) coreseq[k]=keep[k]; // Copy keep[] into coreseq[]
1360
1361     // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
1362     int v1=v; v=1;
1363     n = Filter2(coreseq,coverage_core,qid_core,qsc_core,seqid_core,seqid_core,Ndiff_core);
1364     v=v1;
1365     if (v>=2)
1366         {
1367             printf("%i out of %i core alignment sequences passed filter (",n,N_in-N_ss);
1368             if (par.coverage_core)
1369                 printf("%i%% min coverage, ",coverage_core);
1370             if (qid_core)
1371                 printf("%i%% min sequence identity to query, ",qid_core);
1372             if (qsc_core>-10)
1373                 printf("%.2f bits min score per column to query, ",qsc_core);
1374             printf("%i%% max pairwise sequence identity)\n",seqid_core);
1375         }
1376
1377     // Calculate bare AA frequencies and transition probabilities -> qcore.f[i][a], qcore.tr[i][a]
1378     FrequenciesAndTransitions(qcore,coreseq);
1379
1380     // Add transition pseudocounts to query -> q.p[i][a] (gapd=1.0, gape=0.333, gapf=gapg=1.0, gaph=gapi=1.0, gapb=1.0
1381     qcore.AddTransitionPseudocounts(1.0,0.333,1.0,1.0,1.0,1.0,1.0);
1382
1383     // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1384     qcore.PreparePseudocounts();
1385
1386     // Add amino acid pseudocounts to query: qcore.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1387     qcore.AddAminoAcidPseudocounts(2,1.5,2.0,1.0); // pcm=2, pca=1.0, pcb=2.5, pcc=1.0
1388
1389     // Filter out all sequences below min score per column with qcore
1390     n=FilterWithCoreHMM(keep, coresc, qcore);
1391
1392     if (v>=2) cout<<n<<" out of "<<N_in-N_ss<<" sequences filtered by minimum score-per-column threshold of "<<qsc_core<<"\n";
1393     delete[] coreseq; coreseq = NULL;
1394     return n;
1395 }
1396
1397
1398 /////////////////////////////////////////////////////////////////////////////////////
1399 /**
1400  * @brief Filter out all sequences below a minimum score per column with profile qcore
1401  */
1402 int
1403 Alignment::FilterWithCoreHMM(char in[], float coresc, HMM& qcore)
1404 {
1405     int k; // count sequences in alignment
1406     int i; // column in query alignment
1407     int a; // amino acid (0..19)
1408     int n=1; // number of sequences that passed filter
1409     float** logodds=new float*[L+1]; // log-odds ratios for HMM qcore
1410     char gap; // 1: previous state in seq k was a gap 0: previous state in seq k was an amino acid
1411     float score; // score of sequence k aligned with qcore
1412
1413     for (i=1; i<=L; i++) logodds[i]=new(float[21]);
1414
1415     // Determine first[k], last[k]?
1416     if (first==NULL)
1417         {
1418             first=new int[N_in];// first non-gap position in sequence k
1419             last =new int[N_in];// last non-gap position in sequence k
1420             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1421                 {
1422                     for (i=1; i<=L; i++) if (X[k][i]<NAA) break;
1423                     first[k]=i;
1424                     for (i=L; i>=1; i--) if (X[k][i]<NAA) break;
1425                     last[k]=i;
1426                 }
1427         }
1428
1429     // Determine number of residues nres[k]?
1430     if (nres==NULL)
1431         {
1432             nres=new int[N_in];
1433             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1434                 {
1435                     int nr=0;
1436                     for (i=first[k]; i<=last[k]; i++)
1437                         if (X[k][i]<NAA) nr++;
1438                     nres[k]=nr;
1439                     // printf("%20.20s nres=%3i first=%3i last=%3i\n",sname[k],nr,f,l);
1440                 }
1441         }
1442
1443     // Precalculate the log-odds for qcore
1444     for (i=1; i<=L; i++)
1445         {
1446             for (a=0; a<NAA; a++)
1447                 logodds[i][a]=fast_log2(qcore.p[i][a]/pb[a]);
1448             logodds[i][ANY]=-0.5; // half a bit penalty for X
1449
1450             // printf(" A R N D C Q E G H I L K M F P S T W Y V\n");
1451             // printf("%6i ",i);
1452             // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*qcore.f[i][a]);
1453             // printf("\n");
1454             // printf(" ");
1455             // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*qcore.g[i][a]);
1456             // printf("\n");
1457             // printf(" ");
1458             // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*qcore.p[i][a]);
1459             // printf("\n");
1460             // printf(" ");
1461             // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*pb[a]);
1462             // printf("\n");
1463             // printf(" ");
1464             // for (a=0; a<20; ++a) fprintf(stdout,"%5.2f ",fast_log2(qcore.p[i][a]/pb[a]));
1465             // printf("\n");
1466         }
1467
1468     // Main loop: test all sequences k
1469     for (k=kfirst+1; k<N_in; k++)
1470         {
1471             if (!in[k]) continue; // if in[k]==0 sequence k will be suppressed directly
1472
1473             float score_M=0.0;
1474             float score_prev=0.0;
1475
1476             // Calculate score of sequence k with core HMM
1477             score=0; gap=0;
1478             for (i=first[k]; i<=last[k]; i++)
1479                 {
1480                     score_M=0.0;
1481                     if (X[k][i]<=ANY) // current state is Match
1482                         {
1483                             score_M=logodds[i][ (int)X[k][i]];
1484                             score+=logodds[i][ (int)X[k][i]];
1485                             if (gap) score+=qcore.tr[i][D2M]; else score+=qcore.tr[i][M2M];
1486                             gap=0;
1487                         }
1488                     else if (X[k][i]==GAP) // current state is Delete (ignore ENDGAPs)
1489                         {
1490                             if (gap) score+=qcore.tr[i][D2D]; else score+=qcore.tr[i][M2D];
1491                             gap=1;
1492                         }
1493                     if (I[k][i]) score+=qcore.tr[i][M2I]+(I[k][i]-1)*qcore.tr[i][I2I]+qcore.tr[i][I2M];
1494                     // if (k==2) printf("i=%3i %c:%c score_M=%6.2f score=%6.2f score_sum=%6.2f \n",i,i2aa(X[kfirst][i]),i2aa(X[k][i]),score_M,score-score_prev,score);
1495                     score_prev=score;
1496                 }
1497
1498             printf("k=%3i score=%6.2f\n",k,score);
1499             if (score<nres[k]*coresc) in[k]=0; else n++;// reject sequence k?
1500         }
1501     for (i=1; i<=L; i++){
1502         delete[] logodds[i]; logodds[i] = NULL;
1503     }
1504     delete[] logodds; logodds = NULL;
1505     return n;
1506 }
1507
1508
1509 /* MR1 */
1510 #if 0
1511 /////////////////////////////////////////////////////////////////////////////////////
1512 /**
1513  * @brief Filter alignment to given diversity/Neff
1514  */
1515 bool
1516 Alignment::FilterNeff()
1517 {
1518     int v1=v;
1519     v=v1-1;
1520     const float TOLX=0.001;
1521     const float TOLY=0.02;
1522     char dummy[N_in+1];
1523     for (int k=0; k<N_in; ++k) dummy[k]=keep[k];
1524     float x=0.0,y=0.0;
1525     float x0=-1.0;
1526     float x1=+2.0;
1527     float y0=filter_by_qsc(x0,dummy);
1528     float y1=filter_by_qsc(x1,dummy);
1529     int i=2;
1530     while (y0-par.Neff>0 && par.Neff-y1>0)
1531         {
1532             x = x0 + (par.Neff-y0)*(x1-x0)/(y1-y0); // linear interpolation between (x0,y0) and (x1,y1)
1533             y = filter_by_qsc(x,dummy);
1534             if (v>=2) printf(" %3i  x0=%6.3f -> %6.3f     x=%6.3f -> %6.3f     x1=%6.3f -> %6.3f \n",++i,x0,y0,x,y,x1,y1);
1535             if (y>par.Neff) {x0=x; y0=y;} else {x1=x; y1=y;}
1536             if (fabs(par.Neff-y)<TOLY || x1-x0<TOLX) break;
1537         }
1538     v=v1;
1539
1540     if (y0>=par.Neff && y1<=par.Neff)
1541         {
1542             // Write filtered alignment WITH insert states (lower case) to alignment file
1543             if (v>=2) printf("Found Neff=%6.3f at filter threshold qsc=%6.3f\n",y,x);
1544             return true;
1545         }
1546     else if (v>=1)
1547         printf("Diversity of unfiltered alignment %.2f is below target diversity %.2f. No alignment written\n",y0,par.Neff);
1548
1549     return false;
1550 }
1551
1552 float Alignment::filter_by_qsc(float qsc, char* dummy)
1553 {
1554     HMM q;
1555     for (int k=0; k<N_in; ++k) keep[k]=dummy[k];
1556     Filter2(keep,par.coverage,0,qsc,par.max_seqid+1,par.max_seqid,0);
1557     FrequenciesAndTransitions(q);
1558     //   printf("qsc=%4.1f  N_filtered=%-3i  Neff=%6.3f\n",qsc,n,q.Neff_HMM);
1559     return q.Neff_HMM;
1560 }
1561 #endif
1562
1563 /////////////////////////////////////////////////////////////////////////////////////
1564 /**
1565  * @brief  Calculate AA frequencies q.p[i][a] and transition probabilities q.tr[i][a] from alignment
1566  */
1567 void
1568 Alignment::FrequenciesAndTransitions(HMM& q, char* in)
1569 {
1570     int k; // index of sequence
1571     int i; // position in alignment
1572     int a; // amino acid (0..19)
1573     int ni[NAA+3]; // number of times amino acid a occurs at position i
1574     int naa; // number of different amino acids
1575
1576     if (v>=3)
1577         cout<<"Calculating position-dependent weights on subalignments\n";
1578
1579     if (in==NULL) in=keep; // what's this good for?
1580
1581     if (N_filtered>1)
1582         {
1583             for (k=0; k<N_in; k++) wg[k]=0.0; // initialized wg[k]
1584             // Calculate global weights
1585             for (i=1; i<=L; i++) // for all positions i in alignment
1586                 {
1587                     for (a=0; a<20; a++) ni[a]=0;
1588                     for (k=0; k<N_in; k++) if (in[k]) ni[ (int)X[k][i]]++;
1589                     naa=0; for (a=0; a<20; a++) if(ni[a]) naa++;
1590                     if (!naa) naa=1; //naa=0 when column consists of only gaps and Xs (=ANY)
1591                     for (k=0; k<N_in; k++)
1592                         if (in[k] && X[k][i]<20)
1593                             wg[k] += 1.0/float(ni[ (int)X[k][i]]*naa*(nres[k]+30));
1594                     // ensure that each residue of a short sequence contributes as much as a residue of a long sequence:
1595                     // contribution is proportional to one over sequence length nres[k] plus 30.
1596                 }
1597             NormalizeTo1(wg,N_in);
1598
1599
1600             // Do pos-specific sequence weighting and calculate amino acid frequencies and transitions
1601             for (k=0; k<N_in; k++) X[k][0]=ENDGAP; // make sure that sequences ENTER subalignment j for j=1
1602             for (k=0; k<N_in; k++) X[k][L+1]=ENDGAP; // does it have an influence?
1603
1604 #if 0
1605 //#ifdef HAVE_OPENMP
1606             if(par.wg != 1)
1607             {
1608                 #pragma omp parallel sections
1609                 {
1610                     #pragma omp section
1611                     Amino_acid_frequencies_and_transitions_from_M_state(q,in); // use subalignments of seqs with residue in i
1612                     #pragma omp section
1613                     Transitions_from_I_state(q,in); // use subalignments of seqs with insert in i
1614                     #pragma omp section
1615                     Transitions_from_D_state(q,in); // use subalignments of seqs with delete in i. Must be last of these three calls if par.wg==1!
1616                 }
1617             }
1618             else
1619             {
1620                 #pragma omp parallel sections
1621                 {
1622                     #pragma omp section
1623                     Amino_acid_frequencies_and_transitions_from_M_state(q,in); // use subalignments of seqs with residue in i
1624                     #pragma omp section
1625                     Transitions_from_I_state(q,in); // use subalignments of seqs with insert in i
1626                 }
1627                 Transitions_from_D_state(q,in); // use subalignments of seqs with delete in i. Must be last of these three calls if par.wg==1!
1628             }
1629 #else
1630             Amino_acid_frequencies_and_transitions_from_M_state(q,in);
1631             Transitions_from_I_state(q,in);
1632             Transitions_from_D_state(q,in);
1633 #endif
1634         }
1635     else // N_filtered==1
1636         {
1637             X[kfirst][0]=X[kfirst][L+1]=ANY; // (to avoid anallowed access within loop)
1638             q.Neff_HMM=1.0f;
1639             for (i=0; i<=L+1; i++) // for all positions i in alignment
1640                 {
1641                     q.Neff_M[i]=1.0f;
1642                     q.Neff_I[i]=q.Neff_D[i]=0.0f;
1643                     for (a=0; a<20; a++) q.f[i][a]=0.0;
1644                     /* this is the crucial change that makes terminal-X work */
1645                     //q.f[i][ (int)(X[kfirst][i]) ] = 1.0; /* MR1 */
1646                     if (X[kfirst][i] < ANY) /* MR1 */
1647                         q.f[i][(unsigned int) X[kfirst][i] ] = 1.0;
1648                     else
1649                         for (a=0; a<20; ++a) q.f[i][a]=pb[a];
1650                     q.tr[i][M2M]=0;
1651                     q.tr[i][M2I]=-100000.0;
1652                     q.tr[i][M2D]=-100000.0;
1653                     q.tr[i][I2M]=-100000.0;
1654                     q.tr[i][I2I]=-100000.0;
1655                     q.tr[i][D2M]=-100000.0;
1656                     q.tr[i][D2D]=-100000.0;
1657                 }
1658             q.tr[0][I2M]=0;
1659             q.tr[L][I2M]=0;
1660             q.tr[0][D2M]=0;
1661             q.Neff_M[0]=q.Neff_I[0]=q.Neff_D[0]=99.999; // Neff_av[0] is used for calculation of transition pseudocounts for the start state
1662         }
1663
1664     if (v>=3)
1665         {
1666             printf("\nMatches:\n");
1667             printf("col Neff nseqs\n");
1668             for (i=1; i<=imin(L,100); i++)
1669                 printf("%3i %5.2f %3i\n",i,q.Neff_M[i],nseqs[i]);
1670
1671             printf("\nInserts:\n");
1672             printf("col Neff nseqs\n");
1673             for (i=1; i<=imin(L,100); i++)
1674                 printf("%3i %5.2f %3i\n",i,q.Neff_I[i],nseqs[i]);
1675
1676             printf("\nDeletes:\n");
1677             printf("col Neff nseqs\n");
1678             for (i=1; i<=imin(L,100); i++)
1679                 printf("%3i %5.2f %3i\n",i,q.Neff_D[i],nseqs[i]);
1680         }
1681
1682     // Copy column information into HMM q
1683     q.L=L;
1684     q.N_in=N_in;
1685     q.N_filtered=N_filtered;
1686     for (i=1; i<=L; i++) q.l[i]=l[i];
1687
1688     // Set names in HMM q
1689     if (strlen(q.name)==0) strcpy(q.name,name);
1690     if (strlen(q.longname)==0) strcpy(q.longname,longname);
1691     if (strlen(q.fam)==0) strcpy(q.fam,fam);
1692     ScopID(q.cl,q.fold,q.sfam,q.fam); // derive superfamily, fold and class code from family name
1693     strcpy(q.file,file); // Store basename of alignment file name in q.file
1694
1695     // Copy sequences to be displayed into HMM
1696     q.nss_dssp=q.nsa_dssp=q.nss_pred=q.nss_conf=q.nfirst=-1;
1697     int n=0;
1698     if (kss_dssp>=0) q.nss_dssp=n++; // copy dssp sequence?
1699     if (ksa_dssp>=0) q.nsa_dssp=n++; // copy dssp sequence?
1700     if (kss_pred>=0) q.nss_pred=n++; // copy psipred sequence?
1701     if (kss_conf>=0) q.nss_conf=n++; // copy confidence value sequence?
1702
1703     // Calculate consensus sequence?
1704     if (par.showcons || par.cons)
1705         {
1706             float maxw;
1707             int maxa;
1708             if (par.showcons)
1709                 {
1710                     // Reserve space for consensus/conservation sequence as Q-T alignment mark-up
1711                     q.ncons=n++;
1712                     q.sname[q.ncons]=new(char[10]);
1713                     if (!q.sname[q.ncons]) {MemoryError("array of names for displayed sequences");}
1714                     strcpy(q.sname[q.ncons],"Consensus");
1715                     q.seq[q.ncons]=new char[L+2];
1716                     if (!q.seq[q.ncons]) {MemoryError("array of names for displayed sequences");}
1717                 }
1718             if (par.cons)
1719                 {
1720                     // Reserve space for consensus sequence as first sequence in alignment
1721                     q.nfirst=n++; kfirst=-1;
1722                     q.sname[q.nfirst]=new char[strlen(name)+11];
1723                     if (!q.sname[q.nfirst]) {MemoryError("array of names for displayed sequences");}
1724                     strcpy(q.sname[q.nfirst],name);
1725                     strcat(q.sname[q.nfirst],"_consensus");
1726                     q.seq[q.nfirst]=new char[L+2];
1727                     if (!q.seq[q.nfirst]) {MemoryError("array of names for displayed sequences");}
1728                 }
1729             // Calculate consensus amino acids using similarity matrix
1730             for (i=1; i<=L; i++)
1731                 {
1732                     maxw=0.0; maxa=0;
1733                     for (a=0; a<20; a++)
1734                         if (q.f[i][a]-pb[a]>maxw) {maxw = q.f[i][a]-pb[a]; maxa = a;}
1735
1736                     if (par.showcons)
1737                         {
1738                             maxw =0.0;
1739                             for (int b=0; b<20; b++) maxw += q.f[i][b]*Sim[maxa][b]*Sim[maxa][b];
1740                             maxw *= q.Neff_M[i]/(q.Neff_HMM+1); // columns with many gaps don't get consensus symbol
1741                             if (maxw>0.6) q.seq[q.ncons][i] = uprchr(i2aa(maxa));
1742                             else if (maxw>0.4) q.seq[q.ncons][i] = lwrchr(i2aa(maxa));
1743                             else q.seq[q.ncons][i] = 'x';
1744                         }
1745                     if (par.cons) q.seq[q.nfirst][i] = uprchr(i2aa(maxa));
1746                 }
1747             if (par.showcons)
1748                 {
1749                     q.seq[q.ncons][0]='-';
1750                     q.seq[q.ncons][L+1]='\0';
1751                 }
1752             if (par.cons)
1753                 {
1754                     q.seq[q.nfirst][0]='-';
1755                     q.seq[q.nfirst][L+1]='\0';
1756                 }
1757         }
1758
1759     // Copy sequences to be displayed from alignment to HMM
1760     for (k=0; k<N_in; k++)
1761         {
1762             int nn;
1763             if (display[k])
1764                 {
1765                     if (0 && (n>=MAXSEQDIS)) {
1766                         /* FIXME: the test was if(n>=MAXSEQDIS),
1767                            this test was necessary because alignment memory was static,
1768                            now it should be dynamic, and should always have the right size,
1769                            there are at least number-of-sequences plus a 'bit' more
1770                            however, I do not know what that 'bit' is likely to be (in the future).
1771                            at the moment it is 1 for the consnseus and 1 for structure,
1772                            but this might change (FS)
1773                          */
1774                         if (par.mark) cerr<<"WARNING: maximum number "<<MAXSEQDIS<<" of sequences for display of alignment exceeded\n";
1775                         break;
1776                     }
1777                     if (k==kss_dssp) nn=q.nss_dssp; // copy dssp sequence to nss_dssp
1778                     else if (k==ksa_dssp) nn=q.nsa_dssp;
1779                     else if (k==kss_pred) nn=q.nss_pred;
1780                     else if (k==kss_conf) nn=q.nss_conf;
1781                     else if (k==kfirst) nn=q.nfirst=n++;
1782                     else nn=n++;
1783                     // strcut(sname[k]," "); // delete rest of name line beginning with two spaces " " // Why this?? Problem for pdb seqs without chain
1784                     q.sname[nn]=new char[strlen(sname[k])+1];
1785                     if (!q.sname[nn]) {MemoryError("array of names for displayed sequences");}
1786                     strcpy(q.sname[nn],sname[k]);
1787                     q.seq[nn]=new char[strlen(seq[k])+1];
1788                     if (!q.seq[nn]) {MemoryError("array of names for displayed sequences");}
1789                     strcpy(q.seq[nn],seq[k]);
1790                 }
1791         }
1792     q.n_display=n; // how many sequences to be displayed in alignments?
1793
1794     // Copy secondary structure information into HMM
1795     if (kss_dssp>=0)
1796         for (i=1; i<=L; i++) q.ss_dssp[i]=X[kss_dssp][i];
1797     if (ksa_dssp>=0)
1798         for (i=1; i<=L; i++) q.sa_dssp[i]=X[ksa_dssp][i];
1799     if (kss_pred>=0)
1800         {
1801             for (i=1; i<=L; i++) q.ss_pred[i]=X[kss_pred][i];
1802             if (kss_conf>=0)
1803                 for (i=1; i<=L; i++) q.ss_conf[i]=X[kss_conf][i];
1804             else
1805                 for (i=1; i<=L; i++) q.ss_conf[i]=5;
1806         }
1807
1808     q.lamda=0.0;
1809     q.mu=0.0;
1810
1811     // Debug: print occurence of amino acids for each position i
1812     if (v>=2) printf("Effective number of sequences exp(entropy) = %-4.1f\n",q.Neff_HMM); //PRINT
1813     if (v>=3)
1814         {
1815             cout<<"\nMatr: ";
1816             for (a=0; a<20; a++) printf("%4.1f ",100*pb[a]);
1817             cout<<"\nAmino acid frequencies without pseudocounts:\n";
1818             cout<<" A R N D C Q E G H I L K M F P S T W Y V\n";
1819             for (i=1; i<=L; i++)
1820                 {
1821                     printf("%3i: ",i);
1822                     for (a=0; a<20; a++) printf("%4.0f ",100*q.f[i][a]);
1823                     cout<<endl;
1824                 }
1825             cout<<"\n";
1826
1827             printf("\nListing transition probabilities without pseudocounts:\n");
1828             printf(" i M->M M->I M->D I->M I->I D->M D->D Neff_M Neff_I Neff_D\n");
1829             for (i=0; i<=L; i++)
1830                 {
1831                     printf("%4i %6.3f %6.3f %6.3f ",i,pow(2.0,q.tr[i][M2M]),pow(2.0,q.tr[i][M2I]),pow(2.0,q.tr[i][M2D]));
1832                     printf("%6.3f %6.3f ",pow(2.0,q.tr[i][I2M]),pow(2.0,q.tr[i][I2I]));
1833                     printf("%6.3f %6.3f ",pow(2.0,q.tr[i][D2M]),pow(2.0,q.tr[i][D2D]));
1834                     printf("%6.3f %6.3f %6.3f\n",q.Neff_M[i],q.Neff_I[i],q.Neff_D[i]);
1835                 }
1836         }
1837     q.trans_lin=0;
1838     q.has_pseudocounts=false; /* MR1 */
1839
1840     return;
1841 }
1842
1843
1844 /////////////////////////////////////////////////////////////////////////////////////
1845 /*
1846  * FIXME: one of the most time consuming routines (according to gprof on r112)
1847  */
1848 /**
1849  * @brief Calculate freqs q.f[i][a] and transitions q.tr[i][a] (a=MM,MI,MD) with pos-specific subalignments
1850  * Pos-specific weights are calculated like in "GetPositionSpecificWeights()"
1851  */
1852 void
1853 Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in)
1854 {
1855   // Calculate position-dependent weights wi[k] for each i.
1856   // For calculation of weights in column i use sub-alignment
1857   // over sequences which have a *residue* in column i (no gap, no end gap)
1858   // and over columns where none of these sequences has an end gap.
1859   // This is done by updating the arrays n[j][a] at each step i-1->i while letting i run from 1 to L.
1860   // n[j][a] = number of occurences of amino acid a at column j of the subalignment,
1861   // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
1862   // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
1863   // then the old values wi[k], Neff[i-1], and ncol are used for the new position i.
1864   // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
1865
1866   int k; // index of sequence
1867   int i,j; // position in alignment
1868   int a; // amino acid (0..19)
1869   int naa; // number of different amino acids
1870   int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
1871   //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
1872   float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i
1873   //float Neff[MAXRES]; // diversity of subalignment i
1874   float *Neff = new float[par.maxResLen]; // diversity of subalignment i
1875   int nseqi=0; // number of sequences in subalignment i
1876   int ncol=0; // number of columns j that contribute to Neff[i]
1877   char change; // has the set of sequences in subalignment changed? 0:no 1:yes
1878   float fj[NAA+3]; // to calculate entropy
1879   float sum;
1880
1881   wi = new float[N_in+2];
1882
1883   // Global weights?
1884   if (par.wg==1)
1885     for (k=0; k<N_in; k++)
1886         wi[k]=wg[k];
1887
1888   // Initialization
1889   q.Neff_HMM=0.0f;
1890   Neff[0]=0.0; // if the first column has no residues (i.e. change==0), Neff[i]=Neff[i-1]=Neff[0]
1891   n = new int*[L+2];
1892 //#if 0
1893 #ifdef HAVE_OPENMP
1894 #pragma omp parallel for schedule(static) private(a)
1895 #endif
1896   for (j=1; j<=L; j++) {
1897           n[j]=new(int[NAA+3]);
1898           for (a=0; a<NAA+3; a++)
1899                   n[j][a]=0;
1900   }
1901
1902
1903   //////////////////////////////////////////////////////////////////////////////////////////////
1904   // Main loop through alignment columns
1905   for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
1906   {
1907       if (par.wg==0)
1908       {
1909           change=0;
1910           // Check all sequences k and update n[j][a] and ri[j] if necessary
1911           for (k=0; k<N_in; k++)
1912           {
1913               if (!in[k]) continue;
1914               if (X[k][i-1]>=ANY && X[k][i]<ANY)
1915               { // ... if sequence k was NOT included in i-1 and has to be included for column i
1916                   change=1;
1917                   nseqi++;
1918 //#if 0
1919 #ifdef HAVE_OPENMP
1920 #pragma omp parallel for schedule(static)
1921 #endif
1922                   for (int j=1; j<=L; j++)
1923                       n[j][ (int)X[k][j]]++;
1924               }
1925               else if (X[k][i-1]<ANY && X[k][i]>=ANY)
1926               { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
1927                   change=1;
1928                   nseqi--;
1929 //#if 0
1930 #ifdef HAVE_OPENMP
1931 #pragma omp parallel for schedule(static)
1932 #endif
1933                   for (int j=1; j<=L; j++)
1934                       n[j][ (int)X[k][j]]--;
1935               }
1936           } //end for (k)
1937           nseqs[i]=nseqi;
1938
1939           // If subalignment changed: update weights wi[k] and Neff[i]
1940           if (change)
1941           {
1942               // Initialize weights and numbers of residues for subalignment i
1943               ncol=0;
1944               for (k=0; k<N_in; k++)
1945                   wi[k]=1E-8; // for pathological alignments all wi[k] can get 0; /* MR1 */
1946
1947               // sum wi[k] over all columns j and sequences k of subalignment
1948 //#if 0
1949 #ifdef HAVE_OPENMP
1950 #pragma omp parallel for schedule(static) private(naa, a, k)
1951 #endif
1952               for (j=1; j<=L; j++)
1953               {
1954                   // do at least a fraction MAXENDGAPFRAC of sequences in subalignment contain an end gap in j?
1955                   if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
1956                   naa=0;
1957                   for (a=0; a<20; a++)
1958                       if(n[j][a])
1959                           naa++;
1960                   if (naa==0) continue;
1961 //#if 0
1962 #ifdef HAVE_OPENMP
1963 #pragma omp atomic
1964 #endif
1965                   ncol++;
1966                   for (k=0; k<N_in; k++)
1967                   {
1968                       if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
1969                       {
1970                           // if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Mi=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
1971 //#if 0
1972 #ifdef HAVE_OPENMP
1973 #pragma omp atomic
1974 #endif
1975                           wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
1976                       }
1977                   }
1978               } // end for (j)
1979
1980               // Check whether number of columns in subalignment is sufficient
1981               if (ncol<NCOLMIN)
1982                   // Take global weights
1983                   for (k=0; k<N_in; k++)
1984                       if(in[k] && X[k][i]<ANY)
1985                           wi[k]=wg[k];
1986                       else wi[k]=0.0;
1987
1988               // Calculate Neff[i]
1989               Neff[i]=0.0;
1990 //#if 0
1991 #ifdef HAVE_OPENMP
1992 #pragma omp parallel for schedule(static) private(a, k, fj)
1993 #endif
1994               for (j=1; j<=L; j++)
1995               {
1996                   // do at least a fraction MAXENDGAPFRA of sequences in subalignment contain an end gap in j?
1997                   if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
1998                   for (a=0; a<20; a++) fj[a]=0;
1999                   for (k=0; k<N_in; k++)
2000                       if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
2001                           fj[ (int)X[k][j] ]+=wi[k];
2002                   NormalizeTo1(fj,NAA);
2003                   for (a=0; a<20; a++)
2004                       if (fj[a]>1E-10)
2005 //#if 0
2006 #ifdef HAVE_OPENMP
2007 #pragma omp atomic
2008 #endif
2009                           Neff[i]-=fj[a]*fast_log2(fj[a]);
2010               }
2011               if (ncol>0)
2012                   Neff[i]=pow(2.0,Neff[i]/ncol);
2013               else
2014                   Neff[i]=1.0;
2015           } // end change
2016
2017           else //no update was necessary; copy values for i-1
2018           {
2019               Neff[i]=Neff[i-1];
2020           }
2021       } // end par.wg==0
2022
2023
2024       // Calculate amino acid frequencies q.f[i][a] from weights wi[k]
2025       for (a=0; a<20; a++)
2026           q.f[i][a]=0;
2027       for (k=0; k<N_in; k++)
2028           if (in[k])
2029               q.f[i][ (int)X[k][i] ]+=wi[k];
2030       NormalizeTo1(q.f[i],NAA,pb);
2031
2032       // Calculate transition probabilities from M state
2033       q.tr[i][M2M]=q.tr[i][M2D]=q.tr[i][M2I]=0.0;
2034       for (k=0; k<N_in; k++) //for all sequences
2035       {
2036           if (!in[k]) continue;
2037           //if input alignment is local ignore transitions from and to end gaps
2038           if (X[k][i]<ANY) //current state is M
2039           {
2040               if (I[k][i]) //next state is I
2041                   q.tr[i][M2I]+=wi[k];
2042               else if (X[k][i+1]<=ANY) //next state is M
2043                   q.tr[i][M2M]+=wi[k];
2044               else if (X[k][i+1]==GAP) //next state is D
2045                   q.tr[i][M2D]+=wi[k];
2046           }
2047       } // end for(k)
2048       // Normalize and take log
2049       sum = q.tr[i][M2M]+q.tr[i][M2I]+q.tr[i][M2D]+FLT_MIN;
2050       q.tr[i][M2M]=Log2(q.tr[i][M2M]/sum);
2051       q.tr[i][M2I]=Log2(q.tr[i][M2I]/sum);
2052       q.tr[i][M2D]=Log2(q.tr[i][M2D]/sum);
2053
2054       // for (k=0; k<N_in; k++) if (in[k]) w[k][i]=wi[k];
2055   } // end loop through alignment columns i
2056
2057   // DD TODO:fill in all the missing Neff values
2058
2059   //////////////////////////////////////////////////////////////////////////////////////////////
2060
2061   delete[](wi); wi=NULL;
2062   // delete n[][]
2063   for (j=1; j<=L; j++){
2064     delete[](n[j]);
2065         (n[j]) = NULL;
2066   }
2067   delete[](n); (n) = NULL;
2068
2069   q.tr[0][M2M]=0;
2070   q.tr[0][M2I]=-100000;
2071   q.tr[0][M2D]=-100000;
2072   q.tr[L][M2M]=0;
2073   q.tr[L][M2I]=-100000;
2074   q.tr[L][M2D]=-100000;
2075   q.Neff_M[0]=99.999; // Neff_av[0] is used for calculation of transition pseudocounts for the start state
2076
2077   // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
2078   for (a=0; a<20; a++)
2079       q.f[0][a]=q.f[L+1][a]=pb[a];
2080
2081   // Assign Neff_M[i] and calculate average over alignment, Neff_M[0]
2082   if (par.wg==1)
2083   {
2084 //#if 0
2085 #ifdef HAVE_OPENMP
2086 #pragma omp parallel for schedule(static) private(a)
2087 #endif
2088       for (i=1; i<=L; i++)
2089       {
2090           float sum=0.0f;
2091           for (a=0; a<20; a++)
2092               if (q.f[i][a]>1E-10)
2093                   sum -= q.f[i][a]*fast_log2(q.f[i][a]);
2094 //#if 0
2095 #ifdef HAVE_OPENMP
2096 #pragma omp atomic
2097 #endif
2098           q.Neff_HMM+=pow(2.0,sum);
2099       }
2100       q.Neff_HMM/=L;
2101       float Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
2102       float scale=Log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with inserts at specific pos
2103 //#if 0
2104 #ifdef HAVE_OPENMP
2105 #pragma omp parallel for schedule(static) private(k)
2106 #endif
2107       for (i=1; i<=L; i++)
2108       {
2109           float w_M=-1.0/N_filtered;
2110           for (k=0; k<N_in; k++)
2111               if (in[k] && X[k][i]<=ANY)
2112                   w_M+=wg[k];
2113           if (w_M<0)
2114               q.Neff_M[i]=1.0;
2115           else
2116               q.Neff_M[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_M);
2117           // fprintf(stderr,"M i=%3i ncol=--- Neff_M=%5.2f Nlim=%5.2f w_M=%5.3f Neff_M=%5.2f\n",i,q.Neff_HMM,Nlim,w_M,q.Neff_M[i]);
2118       }
2119   }
2120   else
2121   {
2122       for (i=1; i<=L; i++)
2123       {
2124           q.Neff_HMM+=Neff[i];
2125           q.Neff_M[i]=Neff[i];
2126           if (q.Neff_M[i] == 0) {
2127               q.Neff_M[i] = 1;
2128           } /* MR1 */
2129       }
2130       q.Neff_HMM/=L;
2131   } // end par.wg==1
2132
2133   delete[] Neff; Neff = NULL;
2134
2135   return;
2136
2137 } /* this is the end of Alignment::Amino_acid_frequencies_and_transitions_from_M_state() */
2138
2139
2140 /////////////////////////////////////////////////////////////////////////////////////
2141 /**
2142  * @brief Calculate transitions q.tr[i][a] (a=DM,DD) with pos-specific subalignments
2143  */
2144 void
2145 Alignment::Transitions_from_I_state(HMM& q, char* in)
2146 {
2147     // Calculate position-dependent weights wi[k] for each i.
2148     // For calculation of weights in column i use sub-alignment
2149     // over sequences which have a INSERT in column i
2150     // and over columns where none of these sequences has an end gap.
2151     // This is done by calculating the arrays n[j][a] and rj[j] at each step i-1->i while letting i run from 1 to L.
2152     // n[j][a] = number of occurences of amino acid a at column j of the subalignment,
2153     // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
2154     // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
2155     // then the old values wi[k], Neff[i-1], and ncol are used for the new position i.
2156     // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
2157
2158     int k; // index of sequence
2159     int i,j; // position in alignment
2160     int a; // amino acid (0..19)
2161     int naa; // number of different amino acids
2162     int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
2163     //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
2164     float *wi = NULL; // weight of sequence k in column i, calculated from subalignment i
2165     //float Neff[MAXRES]; // diversity of subalignment i
2166     float *Neff = new float[par.maxResLen]; // diversity of subalignment i
2167     int nseqi; // number of sequences in subalignment i
2168     int ncol; // number of columns j that contribute to Neff[i]
2169     float fj[NAA+3]; // to calculate entropy
2170     float sum;
2171     float Nlim=0.0; // only for global weights
2172     float scale=0.0; // only for global weights
2173
2174     wi = new float[N_in+2];
2175
2176     // Global weights?
2177     if (par.wg==1)
2178         {
2179             for (k=0; k<N_in; k++) wi[k]=wg[k];
2180             Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
2181             scale=Log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with inserts at specific pos
2182         }
2183
2184     // Initialization
2185     n = new int*[L+2];
2186     for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2187
2188     //////////////////////////////////////////////////////////////////////////////////////////////
2189     // Main loop through alignment columns
2190     for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
2191         {
2192             if (par.wg==0) // local weights?
2193                 {
2194
2195                     // Calculate n[j][a] and ri[j]
2196                     nseqi=0;
2197                     for (k=0; k<N_in; k++)
2198                         {
2199                             if (in[k] && I[k][i]>0)
2200                                 {
2201                                     if (nseqi==0) // Initialize only if inserts present! Otherwise O(L*L) even for single sequences!
2202                                         {
2203                                             // Initialization of n[j][a]
2204                                             for (j=1; j<=L; j++)
2205                                                 for (a=0; a<NAA+3; a++) n[j][a]=0;
2206                                         }
2207                                     nseqi++;
2208                                     for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
2209                                 }
2210                         } //end for (k)
2211                     nseqs[i]=nseqi;
2212
2213                     // If there is no sequence in subalignment j ...
2214                     if (nseqi==0)
2215                         {
2216                             ncol=0;
2217                             Neff[i]=0.0; // effective number of sequence = 0!
2218                             q.tr[i][I2M]=-100000;
2219                             q.tr[i][I2I]=-100000;
2220                             continue;
2221                         }
2222
2223                     // update weights wi[k] and Neff[i]
2224                     // if (1)
2225                     {
2226                         // Initialize weights and numbers of residues for subalignment i
2227                         ncol=0;
2228                         for (k=0; k<N_in; k++) wi[k]=0.0;
2229
2230                         // sum wi[k] over all columns j and sequences k of subalignment
2231                         for (j=1; j<=L; j++)
2232                             {
2233                                 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2234                                 naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
2235                                 if (naa==0) continue;
2236                                 ncol++;
2237                                 for (k=0; k<N_in; k++)
2238                                     {
2239                                         if (in[k] && I[k][i]>0 && X[k][j]<ANY)
2240                                             {
2241                                                 if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Ii=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
2242                                                 wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
2243                                             }
2244                                     }
2245                             }
2246
2247                         // Check whether number of columns in subalignment is sufficient
2248                         if (ncol>=NCOLMIN)
2249                             // Take global weights
2250                             for (k=0; k<N_in; k++)
2251                                 if(in[k] && I[k][i]>0) wi[k]=wg[k]; else wi[k]=0.0;
2252
2253                         // Calculate Neff[i]
2254                         Neff[i]=0.0;
2255                         for (j=1; j<=L; j++)
2256                             {
2257                                 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2258                                 for (a=0; a<20; a++) fj[a]=0;
2259                                 for (k=0; k<N_in; k++)
2260                                     if (in[k] && I[k][i]>0 && X[k][j]<ANY)
2261                                         fj[ (int)X[k][j] ]+=wi[k];
2262                                 NormalizeTo1(fj,NAA);
2263                                 for (a=0; a<20; a++)
2264                                     if (fj[a]>1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]);
2265                             }
2266                         if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
2267
2268                     }
2269                     // Calculate transition probabilities from I state
2270                     q.tr[i][I2M]=q.tr[i][I2I]=0.0;
2271                     for (k=0; k<N_in; k++) //for all sequences
2272                         {
2273                             if (in[k] && I[k][i]>0) //current state is I
2274                                 {
2275                                     q.tr[i][I2M]+=wi[k];
2276                                     q.tr[i][I2I]+=wi[k]*(I[k][i]-1);
2277                                 }
2278                         } // end for(k)
2279                 }
2280
2281             else // fast global weights?
2282                 {
2283                     float w_I=-1.0/N_filtered;
2284                     ncol=0;
2285                     q.tr[i][I2M]=q.tr[i][I2I]=0.0;
2286                     // Calculate amino acid frequencies fj[a] from weights wg[k]
2287                     for (k=0; k<N_in; k++)
2288                         if (in[k] && I[k][i]>0)
2289                             {
2290                                 ncol++;
2291                                 w_I+=wg[k];
2292                                 q.tr[i][I2M]+=wi[k];
2293                                 q.tr[i][I2I]+=wi[k]*(I[k][i]-1);
2294                             }
2295                     if (ncol>0)
2296                         {
2297                             if (w_I<0) Neff[i]=1.0;
2298                             else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_I);
2299                             // fprintf(stderr,"I i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_I=%5.3f Neff_I=%5.2f\n",i,ncol,q.Neff_HMM,Nlim,w_I,Neff[i]);
2300                         }
2301                     else
2302                         {
2303                             Neff[i]=0.0;
2304                             q.tr[i][I2M]=-100000;
2305                             q.tr[i][I2I]=-100000;
2306                             continue;
2307                         }
2308                 }
2309
2310             // Normalize and take log
2311             sum = q.tr[i][I2M]+q.tr[i][I2I];
2312             q.tr[i][I2M]=Log2(q.tr[i][I2M]/sum);
2313             q.tr[i][I2I]=Log2(q.tr[i][I2I]/sum);
2314
2315         }
2316     // end loop through alignment columns i
2317     //////////////////////////////////////////////////////////////////////////////////////////////
2318
2319     delete[](wi); wi = NULL;
2320     // delete n[][]
2321     for (j=1; j<=L; j++){
2322         delete[](n[j]); (n[j]) = NULL;
2323     }
2324     delete[](n); (n) = NULL;
2325
2326     q.tr[0][I2M]=0;
2327     q.tr[0][I2I]=-100000;
2328     q.tr[L][I2M]=0;
2329     q.tr[L][I2I]=-100000;
2330     q.Neff_I[0]=99.999;
2331
2332     // Assign Neff_I[i]
2333     for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i] and Neff[i]
2334         q.Neff_I[i]=Neff[i];
2335
2336     delete[] Neff; Neff = NULL;
2337     return;
2338
2339 } /* this is the end of Alignment::Transitions_from_I_state() */
2340
2341
2342 /////////////////////////////////////////////////////////////////////////////////////
2343 /**
2344  * @brief Calculate transitions q.tr[i][a] (a=DM,DD) with pos-specific subalignments
2345  */
2346 void
2347 Alignment::Transitions_from_D_state(HMM& q, char* in)
2348 {
2349     // Calculate position-dependent weights wi[k] for each i.
2350     // For calculation of weights in column i use sub-alignment
2351     // over sequences which have a DELETE in column i
2352     // and over columns where none of these sequences has an end gap.
2353     // This is done by updating the arrays n[j][a] and rj[j] at each step i-1->i while letting i run from 1 to L.
2354     // n[j][a] = number of occurences of index a at column j of the subalignment,
2355     // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
2356     // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
2357     // then the old values wi[k], Neff[i-1], and ncol are used for the new position i.
2358     // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
2359
2360     int k; // index of sequence
2361     int i,j; // position in alignment
2362     int a; // amino acid (0..19)
2363     int naa; // number of different amino acids
2364     int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
2365     //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
2366     float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i
2367     //float Neff[MAXRES]; // diversity of subalignment i
2368     float *Neff = new float[par.maxResLen]; // diversity of subalignment i
2369     int nseqi=0; // number of sequences in subalignment i (for DEBUGGING)
2370     int ncol=0; // number of columns j that contribute to Neff[i]
2371     char change; // has the set of sequences in subalignment changed? 0:no 1:yes
2372     float fj[NAA+3]; // to calculate entropy
2373     float sum;
2374     float Nlim=0.0; // only for global weights
2375     float scale=0.0; // only for global weights
2376
2377     wi = new float[N_in+2]; /* FIXME: FS */
2378     // Global weights?
2379     if (par.wg==1)
2380         {
2381             for (k=0; k<N_in; k++)
2382                 wi[k]=wg[k];
2383             Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
2384             scale=Log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with dels at specific pos
2385         }
2386
2387     // Initialization
2388     n = new int*[L+2];
2389 //#if 0
2390 #ifdef HAVE_OPENMP
2391 #pragma omp parallel for schedule(static) private(a)
2392 #endif
2393     for (j=1; j<=L; j++) {
2394                 n[j]=new(int[NAA+3]);
2395         for (a=0; a<NAA+3; a++)
2396                         n[j][a]=0;
2397         }
2398
2399     //////////////////////////////////////////////////////////////////////////////////////////////
2400     // Main loop through alignment columns
2401     for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
2402     {
2403         if (par.wg==0) // if local weights
2404         {
2405             change=0;
2406             // Check all sequences k and update n[j][a] and ri[j] if necessary
2407             for (k=0; k<N_in; k++)
2408             {
2409                 if (!in[k]) continue;
2410                 if (X[k][i-1]!=GAP && X[k][i]==GAP)
2411                 { // ... if sequence k was NOT included in i-1 and has to be included for column i
2412                     change=1;
2413                     nseqi++;
2414 //#if 0
2415 #ifdef HAVE_OPENMP
2416 #pragma omp parallel for schedule(static)
2417 #endif
2418                     for (int j=1; j<=L; j++)
2419                         n[j][ (int)X[k][j]]++;
2420                 }
2421                 else if (X[k][i-1]==GAP && X[k][i]!=GAP)
2422                 { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
2423                     change=1;
2424                     nseqi--;
2425 //#if 0
2426 #ifdef HAVE_OPENMP
2427 #pragma omp parallel for schedule(static)
2428 #endif
2429                     for (int j=1; j<=L; j++)
2430                         n[j][ (int)X[k][j]]--;
2431                 }
2432             } //end for (k)
2433             nseqs[i]=nseqi;
2434
2435             // If there is no sequence in subalignment j ...
2436             if (nseqi==0)
2437             {
2438                 ncol=0;
2439                 Neff[i]=0.0; // effective number of sequences = 0!
2440                 q.tr[i][D2M]=-100000;
2441                 q.tr[i][D2D]=-100000;
2442                 continue;
2443             }
2444
2445             // If subalignment changed: update weights wi[k] and Neff[i]
2446             if (change)
2447             {
2448                 // Initialize weights and numbers of residues for subalignment i
2449                 ncol=0;
2450                 for (k=0; k<N_in; k++)
2451                     wi[k]=0.0;
2452
2453                 // sum wg[k][i] over all columns j and sequences k of subalignment
2454 //#if 0
2455 #ifdef HAVE_OPENMP
2456 #pragma omp parallel for schedule(static) private(naa, a, k)
2457 #endif
2458                 for (j=1; j<=L; j++)
2459                 {
2460                     if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2461                     naa=0;
2462                     for (a=0; a<20; a++)
2463                         if(n[j][a])
2464                             naa++;
2465                     if (naa==0) continue;
2466 //#if 0
2467 #ifdef HAVE_OPENMP
2468 #pragma omp atomic
2469 #endif
2470                     ncol++;
2471                     for (k=0; k<N_in; k++)
2472                     {
2473                         if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
2474                         {
2475                             if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Di=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
2476 //#if 0
2477 #ifdef HAVE_OPENMP
2478 #pragma omp atomic
2479 #endif
2480                             wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
2481                         }
2482                     }
2483                 } // end for (j)
2484                 // Check whether number of columns in subalignment is sufficient
2485                 if (ncol<NCOLMIN)
2486                     // Take global weights
2487                     for (k=0; k<N_in; k++)
2488                         if(in[k] && X[k][i]==GAP)
2489                             wi[k]=wg[k];
2490                         else
2491                             wi[k]=0.0;
2492
2493                 // Calculate Neff[i]
2494                 Neff[i]=0.0;
2495 //#if 0
2496 #ifdef HAVE_OPENMP
2497 #pragma omp parallel for schedule(static) private(a, k, fj)
2498 #endif
2499                 for (j=1; j<=L; j++)
2500                 {
2501                     if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2502                     for (a=0; a<20; a++) fj[a]=0;
2503                     for (k=0; k<N_in; k++)
2504                         if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
2505                             fj[ (int)X[k][j] ]+=wi[k];
2506                     NormalizeTo1(fj,NAA);
2507                     for (a=0; a<20; a++)
2508                         if (fj[a]>1E-10)
2509 //#if 0
2510 #ifdef HAVE_OPENMP
2511 #pragma omp atomic
2512 #endif
2513                             Neff[i]-=fj[a]*fast_log2(fj[a]);
2514                 }
2515                 if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
2516
2517             }
2518
2519             else //no update was necessary; copy values for i-1
2520             {
2521                 Neff[i]=Neff[i-1];
2522             }
2523
2524             // Calculate transition probabilities from D state
2525             q.tr[i][D2M]=q.tr[i][D2D]=0.0;
2526             for (k=0; k<N_in; k++) //for all sequences
2527             {
2528                 if (in[k] && X[k][i]==GAP) //current state is D
2529                 {
2530                     if (X[k][i+1]==GAP) //next state is D
2531                         q.tr[i][D2D]+=wi[k];
2532                     else if (X[k][i+1]<=ANY) //next state is M
2533                         q.tr[i][D2M]+=wi[k];
2534                 }
2535             } // end for(k)
2536         }
2537
2538         else // fast global weights?
2539         {
2540             float w_D=-1.0/N_filtered;
2541             ncol=0;
2542             q.tr[i][D2M]=q.tr[i][D2D]=0.0;
2543             // Calculate amino acid frequencies fj[a] from weights wg[k]
2544             for (k=0; k<N_in; k++) //for all sequences
2545                 if (in[k] && X[k][i]==GAP) //current state is D
2546                 {
2547                     ncol++;
2548                     w_D+=wg[k];
2549                     if (X[k][i+1]==GAP) //next state is D
2550                         q.tr[i][D2D]+=wi[k];
2551                     else if (X[k][i+1]<=ANY) //next state is M
2552                         q.tr[i][D2M]+=wi[k];
2553                 }
2554             if (ncol>0)
2555             {
2556                 if (w_D<0) Neff[i]=1.0;
2557                 else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_D);
2558                 // fprintf(stderr,"D i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_D=%5.3f Neff_D=%5.2f\n",i,ncol,q.Neff_HMM,Nlim,w_D,Neff[i]);
2559             }
2560             else
2561             {
2562                 Neff[i]=0.0; // effective number of sequences = 0!
2563                 q.tr[i][D2M]=-100000;
2564                 q.tr[i][D2D]=-100000;
2565                 continue;
2566             }
2567         }
2568
2569         // Normalize and take log
2570         sum = q.tr[i][D2M]+q.tr[i][D2D];
2571         q.tr[i][D2M]=Log2(q.tr[i][D2M]/sum);
2572         q.tr[i][D2D]=Log2(q.tr[i][D2D]/sum);
2573
2574     }
2575     // end loop through alignment columns i
2576     //////////////////////////////////////////////////////////////////////////////////////////////
2577
2578     q.tr[0][D2M]=0;
2579     q.tr[0][D2D]=-100000;
2580     q.Neff_D[0]=99.999;
2581
2582     // Assign Neff_D[i]
2583     for (i=1; i<=L; i++)  {
2584         q.Neff_D[i]=Neff[i];
2585         delete[](n[i]);
2586                 (n[i]) = NULL;
2587     }
2588
2589     delete[](n); (n) = NULL;
2590     delete[](wi); wi = NULL;/* FIXME: FS */
2591
2592     delete[] Neff; Neff = NULL;
2593     return;
2594
2595 } /* this is the end of Alignment::Transitions_from_D_state() */
2596
2597
2598
2599 /////////////////////////////////////////////////////////////////////////////////////
2600 /**
2601  * @brief Write alignment without insert states (lower case) to alignment file?
2602  */
2603 void
2604 Alignment::WriteWithoutInsertsToFile(char* alnfile)
2605 {
2606     if (v>=2) cout<<"Writing alignment to "<<alnfile<<"\n";
2607     FILE* alnf;
2608     if (!par.append) alnf = fopen(alnfile,"w"); else alnf = fopen(alnfile,"a");
2609     if (!alnf) OpenFileError(alnfile);
2610     // If alignment name is different from that of query: write name into commentary line
2611     if (strncmp(longname,sname[kfirst],DESCLEN-1)) fprintf(alnf,"#%s\n",longname);
2612     if (v>=2) cout<<"Writing alignment to "<<alnfile<<"\n";
2613     for (int k=0; k<N_in; k++)
2614         if (keep[k] || display[k]==KEEP_ALWAYS) // print if either in profile (keep[k]>0) or display is obligatory (display[k]==2)
2615             {
2616                 fprintf(alnf,">%s\n",sname[k]);
2617                 for (int i=1; i<=L; i++) fprintf(alnf,"%c",i2aa(X[k][i]));
2618                 fprintf(alnf,"\n");
2619             }
2620     fclose(alnf);
2621 }
2622
2623 /////////////////////////////////////////////////////////////////////////////////////
2624 // Write stored,filtered sequences WITH insert states (lower case) to alignment file?
2625 /////////////////////////////////////////////////////////////////////////////////////
2626 void Alignment::WriteToFile(char* alnfile, const char format[])
2627 {
2628     FILE* alnf;
2629     if (!par.append) alnf = fopen(alnfile,"w"); else alnf = fopen(alnfile,"a");
2630     if (!alnf) OpenFileError(alnfile);
2631     // If alignment name is different from that of query: write name into commentary line
2632     if (strncmp(longname,sname[kfirst],DESCLEN-1)) fprintf(alnf,"#%s\n",longname);
2633     if (!format || !strcmp(format,"a3m"))
2634         {
2635             if (v>=2) cout<<"Writing A3M alignment to "<<alnfile<<"\n";
2636             for (int k=0; k<N_in; k++)
2637                 if (keep[k] || display[k]==KEEP_ALWAYS) // print if either in profile (keep[k]>0) or display obligatory (display[k]==2)
2638                     fprintf(alnf,">%s\n%s\n",sname[k],seq[k]+1);
2639         }
2640     else // PSI-BLAST format
2641         {
2642             if (v>=2) cout<<"Writing PSI-BLAST-formatted alignment to "<<alnfile<<"\n";
2643             for (int k=kfirst; k<N_in; k++) // skip sequences before kfirst!!
2644                 if (keep[k] || display[k]==KEEP_ALWAYS) // print if either in profile (keep[k]>0) or display obligatory (display[k]==2)
2645                     {
2646                         strcut(sname[k]);
2647                         fprintf(alnf,"%-20.20s ",sname[k]);
2648                         // for (int i=1; i<=L; i++) fprintf(alnf,"%c",i2aa(X[k][i]));
2649                         // fprintf(alnf,"\n");
2650                         char* ptr=seq[k];
2651                         for (; *ptr!='\0'; ptr++)
2652                             if (*ptr==45 || (*ptr>=65 && *ptr<=90)) fprintf(alnf,"%c",*ptr);
2653                         fprintf(alnf,"\n");
2654                     }
2655         }
2656
2657     fclose(alnf);
2658 }
2659
2660
2661
2662 /*
2663  * FIXME: this function contains a reference to MAXSEQ & MAXCOL
2664  * however, this may not be accessed  (FS)
2665  */
2666 /////////////////////////////////////////////////////////////////////////////////////
2667 /**
2668  * @brief Read a3m slave alignment of hit from file and merge into (query) master alignment
2669  */
2670 void
2671 Alignment::MergeMasterSlave(Hit& hit, char ta3mfile[])
2672 {
2673  Alignment Tali;
2674  char* cur_seq = new(char[MAXCOL]); // Sequence currently read in
2675  int maxcol=MAXCOL;
2676  int l,ll; // position in unaligned template (T) sequence Tali.seq[l]
2677  int i; // counts match states in query (Q) HMM
2678  int j; // counts match states in T sequence Tali.seq[l]
2679  int h; // position in aligned T sequence cur_seq[h]
2680  int k; // sequence index
2681  char c; //
2682  printf("****************%s:%s:%d: did get into MergeMasterSlave\n", __FUNCTION__, __FILE__, __LINE__);
2683  if (v>=3) printf("Merging %s to query alignment\n",ta3mfile);
2684
2685  // If par.append==1 do not print query alignment
2686  if (par.append) for (k=0; k<N_in; k++) keep[k]=display[k]=KEEP_NOT;
2687
2688  // Read template alignment into Tali
2689  FILE* ta3mf=fopen(ta3mfile,"r");
2690  if (!ta3mf) OpenFileError(ta3mfile);
2691  Tali.Read(ta3mf,ta3mfile);
2692  fclose(ta3mf);
2693
2694  // Filter Tali alignment
2695  Tali.Compress(ta3mfile);
2696  N_filtered = Tali.Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);
2697
2698  // Record imatch[j]
2699  int* imatch=new int[hit.j2+1];
2700  int step = hit.nsteps;
2701  for (j=hit.j1; j<=hit.j2; j++)
2702  {
2703  // Advance to position of next T match state j
2704  while (hit.j[step]<j) step--;
2705  imatch[j] = hit.i[step];
2706 // printf("step=%-3i i=%-3i j=%-3i\n",step,imatch[j],j);
2707  }
2708
2709  // Determine number of match states of Qali
2710  for (L=0,l=1; seq[kfirst][l]>'\0'; l++)
2711  if ((seq[kfirst][l]>='A' && seq[kfirst][l]<='Z') || seq[kfirst][l]=='-') L++;
2712
2713  // For each sequence in T alignment: align to Qali
2714  for (k=0; k<Tali.N_in; k++)
2715  {
2716  if (!Tali.keep[k]) continue;
2717  if (N_in>=MAXSEQ)
2718  {
2719  fprintf(stderr,"WARNING in %s: maximum number of %i sequences exceeded while reading %s. Skipping all following sequences\n",program_name,MAXSEQ,ta3mfile);
2720  break;
2721  }
2722  cur_seq[0]=' '; // 0'th position not used
2723
2724  // Add the hit.i1-1 left end gaps to aligned sequence
2725  for (h=1; h<hit.i1; h++) cur_seq[h]='-';
2726
2727  // Advance to match state hit.j1 of Tali.seq[k]
2728  for (j=0, l=1; (c=Tali.seq[k][l])>'\0'; l++)
2729  if ((c>='A' && c<='Z') || c=='-') // match state at position l?
2730  if ((++j)==hit.j1) break; // yes: increment j. Reached hit,j1? yes: break
2731
2732  if (j<hit.j1)
2733  {printf("Error: did not find %i match states in sequence %i of %s. Sequence:\n%s\n",hit.j1,k,Tali.name,Tali.seq[k]); exit(1);}
2734
2735  // Write first match state to cur_seq
2736  int iprev=hit.i1; // index of previous query match state
2737  int lprev=l; // previous T match state in Tali.seq[k][l]
2738  cur_seq[h++] = Tali.seq[k][l]; // first column of alignment is Match-Match state
2739
2740  // For each further match state j in alignment
2741  step = hit.nsteps;
2742  for (j=hit.j1+1; j<=hit.j2; j++)
2743  {
2744  // Advance to position of next T match state j
2745  i=imatch[j];
2746
2747  // Advance to position of next T match state j
2748  while ((c=Tali.seq[k][++l])>'\0' && ((c>='a' && c<='z') || c=='.')) ;
2749
2750  int di=i-iprev; // number of Match states in Q between T match state j-1 and j
2751  int dl=l-lprev; // 1 + number of inserted residues in T sequence between T match state j-1 and j
2752  if (di==1)
2753  {
2754  // One Q match state for one T match state (treated as special case for speed reasons)
2755  // i: i-1 i di=1
2756  // Q: XXXXXX.....XXXXXX
2757  // T: YYYYYYyyyyyYYYYYY
2758  // j: j-1 j
2759  // l: lprev l dl=6
2760
2761  // Inserts in lower case
2762  for (ll=lprev+1; ll<l; ll++)
2763  if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]);
2764
2765  // Template Match state -> upper case
2766  cur_seq[h++] = Tali.seq[k][ll];
2767  }
2768  else if (di==0)
2769  {
2770  // Gap in query: no Q match state for on T match state (special case for speed reasons)
2771  // i: i-1 i-1 di=0
2772  // Q: XXXXXX.....~~~XXX
2773  // T: YYYYYYyyyyyYYYYYY
2774  // j: j-1 j
2775  // l: lprev l dl=6
2776
2777  // All T residues (including T match state) in lower case
2778  for (ll=lprev+1; ll<=l; ll++)
2779  if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]);
2780  }
2781  else if (di>=dl)
2782  {
2783  // More Match states in Q than Inserts in the T sequence
2784  // => half T inserts y left, half right-aligned in uc, gaps to fill up
2785  // Number of T insert residues to be left-aligned: (int)(dl/2)
2786  // i: iprev i di=7
2787  // Q: XXXXXXXXXXXXXXXXXX
2788  // T: YYYYYYYyyy-yyYYYYY
2789  // j: j-1 j
2790  // l: lprev l dl=6
2791
2792  // Add left-bounded template residues
2793  for (ll=lprev+1; ll<=lprev+(int)(dl/2); ll++)
2794  cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2795
2796  // Add central gaps
2797  for (int gap=1; gap<=di-dl; gap++) cur_seq[h++]='-';
2798
2799  // Add right-bounded residues
2800  for (; ll<=l; ll++)
2801  cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2802  }
2803  else if (di<dl)
2804  {
2805  // Fewer Match states in Q than inserts in T sequence
2806  // => half of available space di for left- half for right-aligned T inserts, rest in lc
2807  // number of T inserts to be left-aligned in uc: (int)(di/2),
2808  // i: iprev i di=5
2809  // Q: XXXXXXXXX.XXXXXXX
2810  // T: YYYYYYYyyyyyYYYYY
2811  // j: j-1 j
2812  // l: lprev l dl=6
2813
2814  // Add left-bounded template residues
2815  for (ll=lprev+1; ll<=lprev+(int)(di/2); ll++)
2816  cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2817
2818  // Add central inserts
2819  for (int ins=1; ins<=dl-di; ins++,ll++)
2820  if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]);
2821
2822  // Add right-bounded residues
2823  for (; ll<=l; ll++)
2824  cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2825  }
2826 // printf("i=%-3i j=%-3i l=%-3i cur_seq=%s\n",i,j,l,cur_seq);
2827
2828  iprev=i; lprev=l;
2829  if (h>=maxcol-1000) // too few columns? Reserve double space
2830  {
2831  char* new_seq=new char[2*maxcol];
2832  strncpy(new_seq,cur_seq,maxcol); //////// check: maxcol-1 ????
2833  delete[](cur_seq); (cur_seq) = NULL;
2834  cur_seq=new_seq;
2835  maxcol*=2;
2836  }
2837  }
2838
2839  // Add the remaining gaps '-' to the end of the template sequence
2840  for (i=hit.i2+1; i<=L; i++) cur_seq[h++]='-';
2841  cur_seq[h++]='\0';
2842
2843  keep[N_in] = display[N_in] = KEEP_CONDITIONALLY;
2844  seq[N_in]=new char[h];
2845  if (!seq[N_in]) MemoryError("array for input sequences");
2846  strcpy(seq[N_in],cur_seq);
2847  X[N_in]=new char[h];
2848  if (!X[N_in]) MemoryError("array for input sequences");
2849  I[N_in]=new short unsigned int[h];
2850  if (!I[N_in]) MemoryError("array for input sequences");
2851  sname[N_in]=new char[strlen(Tali.sname[k])+1];
2852  if (!sname[N_in]) MemoryError("array for input sequences");
2853  strcpy(sname[N_in],Tali.sname[k]);
2854  N_in++;
2855
2856 // printf("k=%-3i %s\n",k,Tali.seq[k]);
2857 // printf("Query %s\n",seq[kfirst]);
2858 // printf("k=%-3i %s\n\n",k,cur_seq);
2859
2860  } // end for (k)
2861
2862 // printf("N_in=%-5i HMM=%s with %i sequences\n",N_in,ta3mfile,N_filtered);
2863
2864  delete[] cur_seq; cur_seq = NULL;
2865  delete[] imatch; imatch = NULL;
2866  delete[] ksort; ksort=NULL; // if ksort already existed it will be to short for merged alignment
2867  delete[] first; first=NULL; // if first already existed it will be to short for merged alignment
2868  delete[] last; last=NULL; // if last already existed it will be to short for merged alignment
2869
2870 } /* this is the end of Alignment::MergeMasterSlave() */
2871
2872
2873 /////////////////////////////////////////////////////////////////////////////////////
2874 /**
2875  * @brief Add a sequence to Qali
2876  */
2877 void
2878 Alignment::AddSequence(char Xk[], int Ik[])
2879 {
2880  int i; // position in query and target
2881  if (L<=0) InternalError("L is not set in AddSequence()");
2882  X[N_in]=new char[L+2];
2883  for (i=0; i<=L+1; i++) X[N_in][i]=Xk[i];
2884  if (Ik==NULL)
2885  for (i=0; i<=L+1; i++) I[N_in][i]=0;
2886  else
2887  for (i=0; i<=L+1; i++) I[N_in][i]=Ik[i];
2888  N_in++;
2889 }
2890
2891
2892 /////////////////////////////////////////////////////////////////////////////////////
2893 /**
2894  * @brief Determine matrix of position-specific weights w[k][i] for multiple alignment
2895  * Pos-specific weights are calculated like in "Amino_acid_frequencies_and_transitions_from_M_state()"
2896  */
2897 void
2898 Alignment::GetPositionSpecificWeights(float* w[])
2899 {
2900  // Calculate position-dependent weights wi[k] for each i.
2901  // For calculation of weights in column i use sub-alignment
2902  // over sequences which have a *residue* in column i (no gap, no end gap)
2903  // and over columns where none of these sequences has an end gap.
2904  // This is done by updating the arrays n[j][a] at each step i-1->i while letting i run from 1 to L.
2905  // n[j][a] = number of occurences of amino acid a at column j of the subalignment,
2906  // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
2907  // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
2908  // then the old values w[k][i] and ncol are used for the new position i.
2909  // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
2910
2911  char* in=keep; // to keep the code similar to Amino_acid_frequencies_and_transitions_from_M_state()
2912  int k; // index of sequence
2913  int i,j; // position in alignment
2914  int a; // amino acid (0..19)
2915  int naa; // number of different amino acids
2916  int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
2917  int nseqi=0; // number of sequences in subalignment i
2918  int ncol=0; // number of columns j that contribute to Neff[i]
2919  char change; // has the set of sequences in subalignment changed? 0:no 1:yes
2920
2921
2922  // Global weights?
2923  if (par.wg==1)
2924  {
2925  for (k=0; k<N_in; k++)
2926  for (i=1; i<=L; i++) w[k][i]=wg[k];
2927  }
2928  else
2929  {
2930
2931  // Initialization
2932  n = new int*[L+2];
2933  for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2934  for (j=1; j<=L; j++)
2935  for (a=0; a<NAA+3; a++) n[j][a]=0;
2936
2937  //////////////////////////////////////////////////////////////////////////////////////////////
2938  // Main loop through alignment columns
2939  for (i=1; i<=L; i++) // Calculate w[k][i]
2940  {
2941  change=0;
2942  // Check all sequences k and update n[j][a] and ri[j] if necessary
2943  for (k=0; k<N_in; k++)
2944  {
2945  if (!in[k]) continue;
2946  if (X[k][i-1]>=ANY && X[k][i]<ANY)
2947  { // ... if sequence k was NOT included in i-1 and has to be included for column i
2948  change=1;
2949  nseqi++;
2950  for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
2951  }
2952  else if (X[k][i-1]<ANY && X[k][i]>=ANY)
2953  { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
2954  change=1;
2955  nseqi--;
2956  for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]--;
2957  }
2958  } //end for (k)
2959  nseqs[i]=nseqi;
2960
2961  // If subalignment changed: update weights w[k][i] and Neff[i]
2962  if (change)
2963  {
2964  // Initialize weights and numbers of residues for subalignment i
2965  ncol=0;
2966  for (k=0; k<N_in; k++) w[k][i]=0.0;
2967
2968  // sum wi[k] over all columns j and sequences k of subalignment
2969  for (j=1; j<=L; j++)
2970  {
2971  // do at least a fraction MAXENDGAPFRAC of sequences in subalignment contain an end gap in j?
2972  if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2973  naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
2974  if (naa==0) continue;
2975  ncol++;
2976  for (k=0; k<N_in; k++)
2977  {
2978  if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
2979  {
2980 // if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Mi=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
2981  w[k][i]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
2982  }
2983  }
2984  }
2985
2986  // Check whether number of columns in subalignment is sufficient
2987  if (ncol<NCOLMIN)
2988  // Take global weights
2989  for (k=0; k<N_in; k++)
2990  if(in[k]) {if(X[k][i]<ANY) w[k][i]=wg[k]; else w[k][i]=0.0;}
2991  }
2992  }
2993  // end loop through alignment columns i
2994  ///////////////////////////////////////////////////////////////////////
2995
2996  // delete n[][]
2997  for (j=1; j<=L; j++){
2998  delete[](n[j]); (n[j]) = NULL;
2999  }
3000  delete[](n); (n) = NULL;
3001
3002  }
3003  return;
3004 }
3005
3006 #ifdef CLUSTALO
3007 /* @* Transfer
3008  *
3009  * take sequence data from Clustal and transfer it into
3010  * hhalign accessible information (structure/class)
3011  *
3012  * Note that hhalign does not see all sequences/profiles
3013  * but only sequences that are elements of the 2 profiles
3014  * to be aligned.
3015  *
3016  * References to the required sequences are passed into hhalign
3017  * through auxilliary pointers that are shallow copies of the
3018  * sequence/profile data available to Clustal.
3019  *
3020  * Re-allocating memory for these auxilliary pointers
3021  * would be desaterous, as it might detach the memory
3022  * seen by Clustal.
3023  */
3024 void
3025 Alignment::Transfer(char **ppcProf, int iCnt){
3026
3027     /* @<variables local to Transfer@> */
3028     int iLen; /* length of profile */
3029     int k; /* generic iterator */
3030
3031     /* @<initialisation@> */
3032     N_in = iCnt;
3033     N_filtered = N_ss = 0;
3034     kss_dssp = ksa_dssp = kss_pred = kss_conf = -1;
3035     kfirst = 0;
3036     strcpy(longname, "unknown_long_seq_name");
3037     strcpy(name, "unknown_seq_name");
3038     strcpy(file, "unknown_file_name");
3039     n_display = iCnt;
3040
3041     /* @<determine length of profile@>
3042        all sequences in profile should have same length,
3043        so only do it for 1st */
3044     for (iLen = 0; '\0' != ppcProf[0][iLen]; iLen++);
3045
3046     /* @<allocate memory for sequences etc@> */
3047     for (k = 0; k < iCnt; k++){
3048 #define GOOD_MEASURE 1000 /* Temporary -- can be removed once rest in place */
3049         I[k] = new short unsigned int[iLen+2+GOOD_MEASURE];
3050         X[k] = new char[iLen+2+GOOD_MEASURE];
3051         seq[k] = new char[iLen+2+GOOD_MEASURE];
3052         seq[k][0] = ' ';
3053         seq[k][1] = '\0';
3054         if (NULL == ppcProf[k]){
3055             printf("%s:%d: Arena[%d]=NULL, cnt=%d\n", __FILE__, __LINE__, k, iCnt);
3056             exit(-1);
3057         }
3058         strcat(seq[k], ppcProf[k]);
3059         keep[k] = KEEP_CONDITIONALLY;
3060         display[k] = KEEP_CONDITIONALLY;
3061         sname[k] = new(char[GOOD_MEASURE]);
3062         strcpy(sname[k], "unknown_sname");
3063     } /* (0 <= k < iCnt) */
3064     /* FIXME: Soeding always makes 1st sequence permanent */
3065     /*keep[0] = KEEP_ALWAYS;
3066       display[k] = KEEP_ALWAYS;*/
3067 #if 1
3068     /* Believe that the first and last positions are
3069        most important in stability of this algorithm.
3070        Must make sure that at least 2 sequences with
3071        residues in these positions are kept.
3072        Think any sequence will do, but better to keep
3073        the one with the longest 'contig'
3074     */
3075     int iSeq; /* sequence iterator */
3076     int iHeadLen = 0, iHeadID = -1; /* length & ID of longest head contig */
3077     int iTailLen = 0, iTailID = -1; /* length & ID of longest head contig */
3078     int iCont = -1;
3079     char *pcFind = NULL;
3080
3081 #if 0
3082     printf("%s:%s:%d: NEW PROFILE (%d seq) ================\n",
3083            __FUNCTION__, __FILE__, __LINE__, iCnt);
3084 #endif
3085     for (iSeq = 0; iSeq < iCnt; iSeq++){
3086 #if 0
3087         printf("%s:%s:%d: consider seq %d ------------------\n",
3088                __FUNCTION__, __FILE__, __LINE__, iSeq);
3089 #endif
3090         pcFind = strchr(&seq[iSeq][1], '-');
3091         if (NULL == pcFind){
3092             /* no gap at all in this sequences, spans entire profile */
3093             iHeadID = iTailID = iSeq;
3094             iHeadLen = iTailLen = iLen;
3095             break;
3096         }
3097         iCont = (int)(pcFind - &seq[iSeq][1]);
3098         if (iCont > iHeadLen){
3099             iHeadLen = iCont;
3100             iHeadID  = iSeq;
3101         }
3102         pcFind = strrchr(seq[iSeq], '-');
3103         iCont = iLen - (int)(pcFind - seq[iSeq]);
3104         if (iCont > iTailLen){
3105             iTailLen = iCont;
3106             iTailID  = iSeq;
3107         }
3108
3109 #if 0
3110         printf("%s:%s:%d: seq %3d: len = %d(%d) %s\n",
3111                __FUNCTION__, __FILE__, __LINE__, iSeq, iCont, iLen, seq[iSeq]);
3112 #endif
3113     } /* 0 <= iSeq < iCnt */
3114 #if 0
3115     printf("%s:%s:%d: seq %d is winner with head contig of %d, seq %d tail contig of %d\n"
3116            , __FUNCTION__, __FILE__, __LINE__, iHeadID, iHeadLen, iTailID, iTailLen);
3117 #endif
3118     if ( (-1 == iHeadID) || (-1 == iTailID) ){
3119         printf("%s:%s:%d: profile has no leading and/or trailing residues (h=%d:t=%d:#=%d)\n",
3120                __FUNCTION__, __FILE__, __LINE__, iHeadID, iTailID, iCnt);
3121     }
3122     else{
3123         keep[iHeadID] = KEEP_ALWAYS;
3124         keep[iTailID] = KEEP_ALWAYS;
3125     }
3126 #endif
3127     /* @= */
3128     return;
3129
3130 } /* this is the end of Transfer() */
3131 #endif
3132
3133 #ifdef CLUSTALO
3134 /* @* Alignment::ClobberGlobal (eg: qali)
3135  *
3136  * Note: originally hhalign() was stand-alone code,
3137  * there are a couple of GLOBAL (!) variables,
3138  * which would have been destroyed on exit.
3139  * However, now there is no 'exit' from hhalign(),
3140  * and on re-entry the global variable must be clean again.
3141  */
3142 void
3143 Alignment::ClobberGlobal(void){
3144
3145  /* @<essentials@>
3146  these are essential to re-set (as some of them are used as flags) */
3147  for(int k=0; k<N_in; k++)
3148  {
3149  delete[] sname[k]; sname[k] = NULL;
3150  delete[] seq[k]; seq[k] = NULL;
3151  delete[] X[k]; X[k] = NULL;
3152  delete[] I[k]; I[k] = NULL;
3153  }
3154  delete[] nres; nres = NULL;
3155  delete[] first; first = NULL;
3156  delete[] last; last = NULL;
3157  delete[] ksort; ksort = NULL;
3158  N_in = N_filtered = n_display = 0;
3159  L = 0;
3160  kss_dssp = ksa_dssp = kss_pred = kss_conf = kfirst = -1;
3161
3162  /* @<re-set but keep memory@>
3163  do not free the memory but re-set content */
3164  longname[0] = '\0'; //delete[] longname; longname = NULL;
3165  keep[0] = '\0'; //delete[] keep; keep = NULL;
3166  display[0] = '\0'; //delete[] display; display = NULL;
3167  wg[0] = 0; //delete[] wg; wg = NULL;
3168  nseqs[0] = 0; //delete[] nseqs; nseqs = NULL;
3169  name[0]='\0';
3170  fam[0]='\0';
3171  file[0]='\0';
3172  //delete[] sname; sname = NULL;
3173  //delete[] seq; seq = NULL;
3174  //delete[] X; X = NULL;
3175  //delete[] I; I = NULL;
3176  //delete[] l; l = NULL;
3177
3178  /* @= */
3179  return;
3180 }
3181 #endif