Wrapper for Clustal Omega.
[jabaws.git] / binaries / src / clustalo / src / hhalign / hhalignment-C.h
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 /*********************************************************************
4  * Clustal Omega - Multiple sequence alignment
5  *
6  * Copyright (C) 2010 University College Dublin
7  *
8  * Clustal-Omega is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This file is part of Clustal-Omega.
14  *
15  ********************************************************************/
16
17 /*
18  *  RCS $Id: hhalignment-C.h 236 2011-04-14 11:30:04Z fabian $
19  */
20
21
22 /*
23  * Changelog: Michael Remmert made changes to hhalign stand-alone code
24  * FS implemented some of the changes on 2010-10-28 -> MR1
25  *
26  * Note: MR seems to have changed all [aijk]++ to ++[aijk],
27  * FS did not do that on 2010-10-28
28  */
29
30 // hhalignment.C
31
32 //////////////////////////////////////////////////////////////////////////////
33 //// Class Alignment
34 //////////////////////////////////////////////////////////////////////////////
35
36 // hhalignment.C
37
38 #ifndef MAIN
39 #define MAIN
40 #include <iostream> // cin, cout, cerr
41 #include <fstream> // ofstream, ifstream
42 #include <stdio.h> // printf
43 using std::cout;
44 using std::cerr;
45 using std::endl;
46 using std::ios;
47 using std::ifstream;
48 using std::ofstream;
49 #include <stdlib.h> // exit
50 #include <string> // strcmp, strstr
51 #include <math.h> // sqrt, pow
52 #include <limits.h> // INT_MIN
53 #include <float.h> // FLT_MIN
54 #include <time.h> // clock
55 #include <ctype.h> // islower, isdigit etc
56 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
57 #include "list.h" // list data structure
58 #include "hash.h" // hash data structure
59 #include "hhdecl-C.h"
60 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
61 #include "hhhmm.h"
62 #endif
63
64
65 enum {KEEP_NOT = 0, KEEP_CONDITIONALLY, KEEP_ALWAYS};
66
67 //////////////////////////////////////////////////////////////////////////////
68 // Class Alignment
69 //////////////////////////////////////////////////////////////////////////////
70
71
72 //////////////////////////////////////////////////////////////////////////////
73 // Object constructor
74 //////////////////////////////////////////////////////////////////////////////
75 Alignment::Alignment(int maxseq, int maxres)
76 {
77
78     //printf(">>>>>>>>%s:%s:%d: maxseq=%d, maxres=%d\n", __FUNCTION__, __FILE__, __LINE__, maxseq, maxres); /* (FS) */
79   longname = new(char[DESCLEN]);
80   sname = new(char*[maxseq+2]); /* MR1 */
81   seq = new(char*[maxseq+2]); /* MR1 */
82   l = new(int[maxres]);
83   X = new(char*[maxseq+2]);  /* MR1 */
84   I = new(short unsigned int*[maxseq+2]); /* MR1 */
85   keep = new(char[maxseq+2]); /* MR1 */
86   display = new(char[maxseq+2]); /* MR1 */
87   wg = new(float[maxseq+2]); /* MR1 */
88   nseqs = new(int[maxres+2]); /* MR1 */
89   N_in=L=0;
90   nres=NULL; // number of residues per sequence k
91   first=NULL; // first residue in sequence k
92   last=NULL; // last residue in sequence k
93   ksort=NULL; // sequence indices sorted by descending nres[k]
94   name[0]='\0'; // no name defined yet
95   longname[0]='\0'; // no name defined yet
96   fam[0]='\0'; // no name defined yet
97   file[0]='\0'; // no name defined yet
98   readCommentLine = '0'; /* MR1 */
99 }
100
101 //////////////////////////////////////////////////////////////////////////////
102 // Object destructor
103 //////////////////////////////////////////////////////////////////////////////
104 Alignment::~Alignment()
105 {
106   delete[] longname; longname = NULL;
107   for(int k=0; k<N_in; k++)
108     {
109       delete[] sname[k]; sname[k] = NULL;
110       delete[] seq[k]; seq[k] = NULL;
111       delete[] X[k]; X[k] = NULL;
112       delete[] I[k]; I[k] = NULL;
113     }
114   delete[] sname; sname = NULL;
115   delete[] seq; seq = NULL;
116   delete[] X; X = NULL;
117   delete[] I; I = NULL;
118   delete[] l; l = NULL;
119   delete[] keep; keep = NULL;
120   delete[] display; display = NULL;
121   delete[] wg; wg = NULL;
122   delete[] nseqs; nseqs = NULL;
123   delete[] nres; nres = NULL;
124   delete[] first; first = NULL;
125   delete[] last; last = NULL;
126   delete[] ksort; ksort = NULL;
127 }
128
129
130 /**
131  * @brief Reads in an alignment from file into matrix seq[k][l] as ASCII
132  */
133 void 
134 Alignment::Read(FILE* inf, char infile[], char* firstline)
135 {
136   int l; // Postion in alignment incl. gaps (first=1)
137   int h; // Position in input line (first=0)
138   int k; // Index of sequence being read currently (first=0)
139   char line[LINELEN]=""; // input line
140   //char cur_seq[MAXCOL]; // Sequence currently read in
141   char *cur_seq=new(char[par.maxColCnt]);
142   char* cur_name; // Sequence currently read in
143   int linenr=0; // current line number in input file
144   char skip_sequence=0;
145   RemoveExtension(file,infile); //copy rootname (w/o path) of infile into file variable of class object
146
147   kss_dssp=ksa_dssp=kss_pred=kss_conf=kfirst=-1;
148   n_display=0;
149   N_in=0;
150   N_filtered=0;
151   N_ss=0;
152   cur_seq[0]=' '; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
153   l=1; k=-1;
154
155   // Does firstline already contain first line of file?
156   if (firstline!= NULL) strcpy(line,firstline);
157
158   /////////////////////////////////////////////////////////////////////////
159   // Read infile line by line
160   /* FIXME: not safe to use MAXSEQ, however, don't think we ever get here (FS) */
161   while(firstline || (fgetline(line,LINELEN,inf) && (k<MAXSEQ))) /* FIXME: FS introduced () around &&, precedence! MR1 */
162       {
163           linenr++;
164           firstline=NULL;
165           if (line[0]=='>') //line contains sequence name
166               {
167                   if (k>=MAXSEQ-1)
168                       {
169                           if (v>=1 && k>=MAXSEQ)
170                               cerr<<endl<<"WARNING: maximum number "<<MAXSEQ<<" of sequences exceded in file "<<infile<<"\n";
171                           break;
172                       }
173                   cur_name=line+1; //beginning of current sequence name
174                   if (k>=0) //if this is at least the second name line
175                       {
176                           if (strlen(cur_seq)==0)
177                               {
178                                   cerr<<endl<<"Error: sequence "<<sname[k]<<" contains no residues."<<endl;
179                                   exit(1);
180                               }
181
182                           // Create space for residues and paste new sequence in
183                           seq[k]=new(char[strlen(cur_seq)+2]);
184                           if (!seq[k]) MemoryError("array for input sequences");
185                           X[k]=new(char[strlen(cur_seq)+2]);
186                           if (!X[k]) MemoryError("array for input sequences");
187                           I[k]=new(short unsigned int[strlen(cur_seq)+2]);
188                           if (!I[k]) MemoryError("array for input sequences");
189                           strcpy(seq[k],cur_seq);
190                       }
191                   skip_sequence=0;
192
193                   k++;
194                   l=1; //position in current sequence (first=1)
195
196                   // display[k]= 0: do not show in Q-T alignments 1: show if not filtered out later 2: show in any case (do not filter out)
197                   // keep[k] = 0: do not include in profile 1: include if not filtered out later 2: include in any case (do not filter out)
198                   /* {KEEP_NOT=0, KEEP_CONDITIONALLY=1, KEEP_ALWAYS=2} */
199                   if (line[1]=='@') cur_name++; //skip @-character in name
200                   if (!strncmp(line,">ss_dssp",8)) {
201                       if (kss_dssp<0) {display[k]=2; n_display++; keep[k]=KEEP_NOT; kss_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;}
202                   }
203                   else if (!strncmp(line,">sa_dssp",8)) {
204                       if (ksa_dssp<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; ksa_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;}
205                   }
206                   else if (!strncmp(line,">ss_pred",8)) {
207                       if (kss_pred<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_pred=k; N_ss++;} else {skip_sequence=1; k--; continue;}
208                   }
209                   else if (!strncmp(line,">ss_conf",8)) {
210                       if (kss_conf<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_conf=k; N_ss++;} else {skip_sequence=1; k--; continue;}
211                   }
212                   else if (!strncmp(line,">ss_",4) || !strncmp(line,">sa_",4)) {
213                       display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; N_ss++;
214                   }
215                   else if (!strncmp(line,">aa_",4)) { // ignore sequences beginning with ">aa_"
216                       skip_sequence=1; k--; continue;
217                   }
218                   //store first real seq
219                   else if (kfirst<0)
220                       {
221                           char word[NAMELEN];
222                           strwrd(word,line); // Copies first word in ptr to str
223                           if (strstr(word,"_consensus"))
224                               {display[k]=2; keep[k]=0; n_display++; kfirst=k;} /* MR1 */
225                           else
226                               {display[k]=keep[k]=KEEP_ALWAYS; n_display++; kfirst=k;}
227                       }
228                   //store all sequences
229                   else if (par.mark==0) {display[k]=keep[k]=KEEP_CONDITIONALLY; n_display++;}
230                   //store sequences up to nseqdis
231                   else if (line[1]=='@'&& n_display-N_ss<par.nseqdis) {display[k]=keep[k]=KEEP_ALWAYS; n_display++;}
232                   else {display[k]=KEEP_NOT; keep[k]=KEEP_CONDITIONALLY;}
233
234                   // store sequence name
235                   if (v>=4) printf("Reading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",cur_name,k,n_display,display[k],keep[k]);
236                   sname[k] = new(char[strlen(cur_name)+1]);
237                   if (!sname[k]) {MemoryError("array for sequence names");}
238                   strcpy(sname[k],cur_name);
239               } // end if(line contains sequence name)
240
241           else if (line[0]=='#') // Commentary line?
242               {
243                   // #PF01367.9 5_3_exonuc: 5'-3' exonuclease, C-terminal SAM fold; PDB 1taq, 1bgx (T:271-174), 1taq (271-174)
244                   if (name[0]) continue; // if already name defined: skip commentary line
245                   char *ptr1, *ptr2;
246                   ptr1=strscn_(line+1); // set ptr1 to first non-whitespace character after '#' -> AC number
247                   strncpy(longname,ptr1,DESCLEN-1); // copy whole commentary line after '# ' into longname
248                   longname[DESCLEN-1]='\0';
249                   strtr(longname,""," ");
250                   ptr2=strcut_(ptr1); // cut after AC number and set ptr2 to first non-whitespace character after AC number
251                   // strcpy(fam,ptr1); // copy AC number to fam
252                   // if (!strncmp(fam,"PF",2)) strcut_(fam,'.'); // if PFAM identifier contains '.' cut it off
253                   // strcut_(ptr2); // cut after first word ...
254                   strcpy(name,ptr1); // ... and copy first word into name
255                   readCommentLine = '1'; /* MR1 */
256               }
257
258           //line contains sequence residues or SS information and does not belong to a >aa_ sequence
259           else if (!skip_sequence)
260               {
261                   if (v>=4) cout<<line<<"\n"; //DEBUG
262                   if (k==-1 && v)
263                       {
264                           cerr<<endl<<"WARNING: No sequence name preceding following line in "<<infile<<":\n\'"<<line<<"\'\n";
265                           continue;
266                       }
267
268                   h=0; //counts characters in current line
269
270                   // Check whether all characters are correct; store into cur_seq
271                   if (keep[k] || (k == kfirst) ) // normal line containing residues /* MR1 */
272                       {
273                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
274                               {
275                                   if (aa2i(line[h])>=0) // ignore white-space characters ' ', \t and \n (aa2i()==-1)
276                                       {cur_seq[l]=line[h]; l++;}
277                                   else if (aa2i(line[h])==-2 && v)
278                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
279                                   h++;
280                               }
281                       }
282                   else if (k==kss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
283                       {
284                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
285                               {
286                                   if (ss2i(line[h])>=0 && ss2i(line[h])<=7)
287                                       {cur_seq[l]=ss2ss(line[h]); l++;}
288                                   else if (v)
289                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
290                                   h++;
291                               }
292                       }
293                   else if (k==ksa_dssp) // lines with dssp solvent accessibility states (. - ???)
294                       {
295                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
296                               {
297                                   if (sa2i(line[h])>=0)
298                                       cur_seq[l++]=line[h];
299                                   else if (v)
300                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
301                                   h++;
302                               }
303                       }
304                   else if (k==kss_pred) // lines with predicted secondary structure (. - H E C)
305                       {
306                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
307                               {
308                                   if (ss2i(line[h])>=0 && ss2i(line[h])<=3)
309                                       {cur_seq[l]=ss2ss(line[h]); l++;}
310                                   else if (v)
311                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
312                                   h++;
313                               }
314                       }
315                   else if (k==kss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
316                       {
317                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
318                               {
319                                   if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9'))
320                                       {cur_seq[l]=line[h]; l++;}
321                                   else if (v)
322                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n";
323                                   h++;
324                               }
325                       }
326                   else if (display[k]) // other lines such as >sa_pred etc
327                       {
328                           while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
329                               {
330                                   if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9') || (line[h]>='A' && line[h]<='B'))
331                                       {cur_seq[l]=line[h]; l++;}
332                                   else if (v)
333                                       cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n";
334                                   h++;
335                               }
336                       }
337                   if (v && l>=/*MAXCOL*/par.maxColCnt-1) 
338                       {
339                           cerr<<endl<<"WARNING: maximum number of residues "<</*MAXCOL*/par.maxColCnt-2<<" exceded in sequence "<<sname[k]<<"\n";
340                           skip_sequence=1;
341                       }
342                   cur_seq[l]='\0'; //Ensure that cur_seq ends with a '\0' character
343               } //end else
344
345       }
346   /////////////////////////////////////////////////////////////////////////
347
348
349   if (k>=0) //if at least one sequence was read in
350       {
351           seq[k]=new(char[strlen(cur_seq)+2]);
352           if (!seq[k]) MemoryError("array for input sequences");
353           X[k]=new(char[strlen(cur_seq)+2]);
354           if (!X[k]) MemoryError("array for input sequences");
355           I[k]=new(short unsigned int[strlen(cur_seq)+2]);
356           if (!I[k]) MemoryError("array for input sequences");
357           strcpy(seq[k],cur_seq);
358       }
359   else
360       {cerr<<endl<<"Error: no sequences found in file "<<infile<<"\n"; exit(1);}
361
362   N_in = k+1;
363
364   // Set name, longname, fam
365   if (!*name) // longname, name and family were not set by '#...' line yet -> extract from first sequence
366       {
367           char* ptr;
368           // strtr(sname[kfirst],"~"," "); // 'transpose': replaces the tilde with a blanc everywhere in sname[kfirst]
369           strncpy(longname,sname[kfirst],DESCLEN-1); // longname is name of first sequence
370           longname[DESCLEN-1]='\0';
371           strncpy(name,sname[kfirst],NAMELEN-1); // Shortname is first word of longname...
372           name[NAMELEN-1]='\0';
373           ptr = strcut(name); // ...until first white-space character
374           if (ptr && islower(ptr[0]) && ptr[1]=='.' && isdigit(ptr[2])) //Scop family code present as second word?
375               {
376                   lwrstr(name); // Transform upper case to lower case
377                   strcut(ptr); // Non-white-space characters until next white-space character..
378                   strcpy(fam,ptr); // ...are the SCOP familiy code
379               }
380           else if (name[0]=='P' && name[1]=='F' && isdigit(name[2]) && isdigit(name[3]) ) //Pfam code
381               {
382                   strcpy(fam,name); // set family name = Pfam code
383               }
384       }
385   
386   
387   
388   delete[] cur_seq; cur_seq = NULL;
389   
390   // Checking for warning messages
391   if (v==0) return;
392   if (v>=2) cout<<"Read "<<infile<<" with "<<N_in<<" sequences\n";
393   if (v>=3) cout<<"Query sequence for alignment has number "<<kfirst<<" (0 is first)\n";
394   return;
395 }
396
397 /*
398  * At this point GetSeqsFromHMM() slots in, however,
399  * only needed in hhbliys.C, so will skip it for moment, MR1
400  */
401
402
403 /////////////////////////////////////////////////////////////////////////////
404 /**
405  * @brief  Convert ASCII in seq[k][l] to int (0-20) in X[k][i],
406  *  throw out all insert states, record their number in I[k][i]
407  *  and store sequences to be displayed in seq[k] */
408 /////////////////////////////////////////////////////////////////////////////
409 void 
410 Alignment::Compress(const char infile[])
411 {
412     int i; // Index for match state (first=1)
413     int l; // Postion in alignment incl. gaps (first=1)
414     int k; // Index for sequences (first=0)
415     int a; // amino acid index
416     char c;
417     int unequal_lengths=0; /* k: seq k doesn't have same number
418                               of match states as seq 0 => WARNING */
419     /* points to next character in seq[k] to be written */
420     /*static short unsigned int h[MAXSEQ];*/
421     /*short*/ unsigned int *h = NULL; /* short may lead to overflow for long alignments, FS, r235 -> r236 */
422
423     h = new(/*short*/ unsigned int[N_in+2]); /* short -> overflow, FS, r235 -> r236 */
424     float *percent_gaps = NULL; /* FS, 2010-Nov */
425     char *match_state = NULL;  /* FS, 2010-Nov */
426
427
428     // Initialize 
429     for (k=0;k<N_in; k++) 
430         {I[k][0]=0;}
431
432     if (v>=3)
433         {
434             if (par.M==1)
435                 cout<<"Using match state assignment by capital letters (a2m format)\n";
436             else if (par.M==2) cout<<"Using percentage-rule match state assignment\n";
437             else if (par.M==3) cout<<"Using residues of first sequence as match states\n";
438         }
439
440     // Create matrices X and I with amino acids represented by integer numbers
441     switch(par.M)
442         {
443
444             /////////////////////////////////////////////////////////////////////////
445             /* a2m/a3m format: match states capital case,
446                inserts lower case, delete states '-', inserted gaps '.'
447                The confidence values for ss prediction are interpreted as follows:
448                0-9:match states(!) '-' :match state '.':insert */
449         case 1:
450         default:
451
452             // Warn if alignment is ment to be -M first or -M NN instead of A2M/A3M
453             if (v>=2 && strchr(seq[kfirst],'-') ) // Seed/query sequence contains a gap ...
454                 {
455                     for (k=1; k<N_in; k++)
456                         if (strpbrk(seq[k],"abcdefghiklmnpqrstuvwxyz.")) break;
457                     if (k==N_in) // ... but alignment contains no lower case residue
458                         printf("WARNING: input alignment %s looks like aligned FASTA instead of A2M/A3M format. Consider using '-M first' or '-M 50'\n",infile);
459                 }
460
461             // Remove '.' characters from seq[k]
462             for(k=0; k<N_in; k++)
463                 {
464                     char* ptrS=seq[k]; // pointer to source: character in seq[k]
465                     char* ptrD=seq[k]; // pointer to destination: seq[k]
466                     while(1) // omit '.' symbols
467                         {
468                             if (*ptrS!='.') {*ptrD=*ptrS; ptrD++;} //leave out '.' symbols
469                             if (!*ptrS) break;
470                             ptrS++;
471                         }
472                 }
473             L=/*MAXRES*/par.maxResLen-2; // needed because L=imin(L,i)
474             for (k=0; k<N_in; k++)
475                 {
476                     i=1; l=1; // start at i=1, not i=0!
477                     if (keep[k]) //skip >ss_dssp, >ss_pred, >ss_conf, >aa_... sequences
478                         {
479                             while((c=seq[k][l++])) // assign residue to c at same time
480                                 {
481                                     if (c>='a' && c<='z') I[k][i-1]++;//insert state = lower case character
482                                     else if (c!='.') //match state = upper case character
483                                         {
484                                             X[k][i]=aa2i(c);
485                                             I[k][i]=0;
486                                             i++;
487                                         }
488                                 }
489                         }
490                     else if (k==kss_dssp || k==kss_pred) // does alignment contain sequence of secondary structure states?
491                         {
492                             while((c=seq[k][l++])) // assign residue to c at same time
493                                 if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=ss2i(c); //match state = upper case character
494                         }
495                     else if (k==ksa_dssp) // does alignment contain sequence of prediction confidence values?
496                         {
497                             while((c=seq[k][l++])) // assign residue to c at same time
498                                 if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=sa2i(c); //match state = upper case character
499                         }
500                     else if (k==kss_conf) // does alignment contain sequence of prediction confidence values?
501                         {
502                             while((c=seq[k][l++])) // assign residue to c at same time
503                                 if (c!='.') X[k][i++]=cf2i(c); //match state = 0-9 or '-'
504                         }
505                     else if (k==kfirst)        // does alignment contain sequence of prediction confidence values?
506                         {
507                             while((c=seq[k][l++]))  // assign residue to c at same time
508                                 if (c!='.')
509                                     {
510                                         X[k][i]=aa2i(c);
511                                         I[k][i]=0;
512                                         ++i;
513                                     }
514                         }
515                     else continue;
516                     i--;
517                     if (L!=i && L!=/*MAXRES*/par.maxResLen-2 && !unequal_lengths) unequal_lengths=k; //sequences have different lengths
518                     L=imin(L,i);
519                 }
520             if (unequal_lengths) break;
521
522             //Replace GAP with ENDGAP for all end gaps /* MR1 */
523             for (k=0; k<N_in; ++k)
524                 {
525                     if (!keep[k]) continue;
526                     for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */
527                     for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
528                 }
529
530             for (i=1; i<=L; i++) this->l[i]=i; //assign column indices to match states
531             if (L<=0)
532                 {
533                     cout<<"\nError: Alignment in "<<infile<<" contains no match states. Consider using -M first or -M <int> option"<<endl;
534                     exit(1);
535                 }
536             
537             if (L==/*MAXRES*/par.maxResLen-2 && v>=2) 
538                 {
539                     printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",L);
540                     break;
541                 }
542             if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" match states\n";
543             break;
544
545             /////////////////////////////////////////////////////////////////////////
546             // gap-rule assignment of match states
547         case 2:
548             int nl[NAA+2]; //nl[a] = number of seq's with amino acid a at position l
549             /* Note: allocating statically is fine most of the time 
550                but when the sequences/profiles get really long 
551                we might run out of memory, so must really do it dynamically. 
552                had to move declaration of float *percent_gaps out of switch()
553             */
554             //float percent_gaps[MAXCOL]; //percentage of gaps in column k (with weighted sequences)
555             percent_gaps = new(float[par.maxColCnt]);
556
557             //determine number of columns L in alignment
558             L=strlen(seq[kfirst])-1;
559
560             // Conversion to integer representation, checking for unequal lengths and initialization
561             if (nres==NULL) nres=new(int[N_in]);
562             for (k=0; k<N_in; k++)
563                 {
564                     if (!keep[k]) continue;
565                     int nr=0;
566                     wg[k]=0; nres[k]=0;
567                     for (l=1; l<=L; l++)
568                         {
569                             X[k][l]=aa2i(seq[k][l]);
570                             if (X[k][l]<NAA) nr++;
571                         }
572                     nres[k]=nr;
573                     if (seq[k][L+1]!='\0' && !unequal_lengths) unequal_lengths=k;
574                 }
575             if (unequal_lengths) break;
576
577             // Quick and dirty calculation of the weight per sequence wg[k]
578             for (l=1; l<=L; l++) // for all positions l in alignment
579                 {
580                     int naa=0; //number of different amino acids
581                     for (a=0; a<20; a++) nl[a]=0;
582                     for (k=0; k<N_in; k++) if (keep[k]) nl[ (int)X[k][l]]++;
583                     for (a=0; a<20; a++) if(nl[a]) naa++;
584                     if (!naa) naa=1; //naa=0 when column consists of only gaps and Xs (=ANY)
585                     for (k=0; k<N_in; k++)
586                         if (keep[k] && (X[k][l]<20) )
587                             {
588                                 //wg[k]+=1.0/float(nl[ (int)X[k][l]]*naa*nres[k]+30.0); /* original version */
589                                 wg[k] += 1.0/float(nl[ (int)X[k][l]]*naa*(nres[k]+30.0)); /* MR1 */
590                                 // wg[k] += 1.0/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */
591                                 // wg[k] += (naa-1.0)/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */
592                             }
593                 } /* 1=l<=L*/
594
595             //Replace GAP with ENDGAP for all end gaps
596             for (k=0; k<N_in; ++k)
597                 {
598                     if (!keep[k]) continue;
599                     for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */
600                     for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
601                 }
602
603             // Add up percentage of gaps
604             for (l=1; l<=L; l++)
605                 {
606                     float res=0;
607                     float gap=0;
608                     for (k=0; k< N_in; k++){
609                         if (keep[k]){
610                             if ( X[k][l]<GAP) res+=wg[k]; /* MR1, AA or ANY, changed from <ANY */
611                             else if ( X[k][l] != ENDGAP) gap+=wg[k];  /* MR1, else: GAP. ENDGAPs are ignored for counting percentage */
612                         }
613                     }
614                     percent_gaps[l]=100.*gap/(res+gap);
615                     if (v>=4) cout<<"percent gaps["<<l<<"]="<<percent_gaps[l]<<" first seq:"<<seq[0][l]<<"\n";
616                 }
617
618             /* Insert states 'bloat' the HMM,
619                throwing them out 'slims' down the HMM.
620                A slimmer HMM takes less time to construct.
621                However, the marriage of Clustal and Hhalign
622                is particularly sensitive to residues
623                at the very end of the profile; these I call
624                'telomeres'. Telomeres must not be shed when
625                throwing out insert states, for the telomeres
626                we set the match threshold to 100%.
627              */
628 #define MGAP_LOGIC 0
629 #define TELOMERE_LOGIC 1
630 #define TELOMERE_DYNAMIC 0
631
632 #define ALWAYS_ACCEPT 101.0 /* do NOT change this parameter, must be >=100,
633                                make slightly bigger than 100% -- to be sure to be sure */
634 #define DEFAULT_MGAPS 100.0 /* Soeding's default is 50, omega default prior to telomere logic was 100
635                                FIXME: this used to be par.Mgaps,
636                                in a later version re-introduce par.Mgaps to keep this value flexible */
637 #define TELOMER_LENGTH 10   /* this parameter must be > 0 (unless DEFAULT_MGAPS=100),
638                                if it is too big (L/2) then telomere logic has no effect,
639                                don't think it should be changed (much) */
640 #define TELOMER_FRACTION 0.10
641             //#define HMM_MIN_LENGTH 0.923
642 #define HMM_MIN_LENGTH 0.950
643 #define FORTRAN_OFFSET 1
644             double dDefaultMgaps;
645             dDefaultMgaps = DEFAULT_MGAPS;
646
647 #if TELOMERE_LOGIC /* turn telomere logic on (1) or off (0) */
648             int iTelomereLength;
649
650 #if TELOMERE_DYNAMIC /* keep telomere length 'dynamic' */
651             iTelomereLength = TELOMER_LENGTH > (int)(L*TELOMER_FRACTION) ? TELOMER_LENGTH : (int)(L*TELOMER_FRACTION);
652 #else
653             iTelomereLength = TELOMER_LENGTH;
654 #endif /* this was dynamic telomere */
655 #endif /* this was telomere logic */
656
657             /* if HMMs get too small (much smaller than profile length L)
658                then one is liable to get a back-tracking error.
659                So we should ensure that the DEFAULT_MGAPS parameter does NOT
660                shrink the HMM too much.
661                take percentage-gap vector, sort it, and fix dDefaultMgaps,
662                such that at least (HMM_MIN_LENGTH)*(L) are left
663              */
664 #if MGAP_LOGIC /* try to adapt Mgaps to size of final HMM */
665             {
666                 float *pfPercentGaps = NULL;
667                 if (NULL == (pfPercentGaps = (float *)malloc((L+1)*sizeof(float)))){
668                     printf("%s:%s:%d: could not malloc %d float for sorted percent-gaps\n",
669                            __FUNCTION__, __FILE__, __LINE__, L+1);
670                     dDefaultMgaps = DEFAULT_MGAPS;
671                 }
672                 else {
673                     for (l = 0; l < L; l++) {
674                         pfPercentGaps[l] = percent_gaps[l+FORTRAN_OFFSET];
675                     }
676                     qsort(pfPercentGaps, L, sizeof(float), CompFltAsc);
677
678                     dDefaultMgaps = pfPercentGaps[(int)(HMM_MIN_LENGTH*L)];
679                     if (dDefaultMgaps < DEFAULT_MGAPS){
680                         //printf("Mgaps = %f <- %f\n", DEFAULT_MGAPS, dDefaultMgaps);
681                         dDefaultMgaps = DEFAULT_MGAPS;
682                     }
683                     else {
684                         //printf("Mgaps = %f\n", dDefaultMgaps);
685                     }
686
687                     free(pfPercentGaps); pfPercentGaps = NULL;
688                 }
689             }
690 #endif /* tried to adapt Mgaps to size of final HMM */
691
692             // Throw out insert states and keep only match states
693             i=0;
694             for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';}
695             for (l=1; l<=L; l++)
696                 {
697 #if TELOMERE_LOGIC
698                     float fMgaps = ALWAYS_ACCEPT;
699                     if ( (l < iTelomereLength) || (L-l < iTelomereLength) ){
700                         /* residue is in telomere, always retain this position */
701                         fMgaps = ALWAYS_ACCEPT;
702                     }
703                     else if (0){
704                         /* FIXME: would like to put a transition phase in here,
705                            where the Mgap value gradually goes down from 100 to DEFAULT_MGAPS,
706                            however, may not be necessary and will make code more clunky */
707                     }
708                     else {
709                         /* position is in centre of sequence,
710                            retain position if less than DEFAULT_MGAPS% gaps at this position,
711                            for example, if DEFAULT_MGAPS=30 throw out if more than 30% gap.
712                            conversely, if DEFAULT_MGAPS=100 throw out if more than 100% gaps,
713                            which can never happen, so always retain */
714                         fMgaps = dDefaultMgaps;
715                     }
716                     if (percent_gaps[l] <= fMgaps)
717 #else /* this was telomere logic */
718                     if (percent_gaps[l]<=float(par.Mgaps))
719 #endif /* this was Soeding default */
720                         {
721                             if (i>=/*MAXRES*/par.maxResLen-2) {
722                                 if (v>=1)
723                                     printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i);
724                                 break;
725                             }
726                             i++;
727                             this->l[i]=l;
728                             for (k=0; k<N_in; k++)
729                                 {
730                                     if (keep[k])
731                                         {
732                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
733                                             X[k][i]=X[k][l];
734                                             I[k][i]=0;
735                                         }
736                                     else if (k==kss_dssp || k==kss_pred)
737                                         {
738                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
739                                             X[k][i]=ss2i(seq[k][l]);
740                                         }
741                                     else if (k==ksa_dssp)
742                                         {
743                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
744                                             X[k][i]=sa2i(seq[k][l]);
745                                         }
746                                     else if (k==kss_conf)
747                                         {
748                                             seq[k][h[k]++]=seq[k][l];
749                                             X[k][i]=cf2i(seq[k][l]);
750                                         }
751                                 }
752                         }
753                     else
754                         {
755                             for (k=0; k<N_in; k++)
756                                 if (keep[k] && X[k][l]<GAP)
757                                     {
758                                         I[k][i]++;
759                                         seq[k][h[k]++]=InsertChr(seq[k][l]);
760                                     }
761                         }
762                 }
763             for (k=0; k<N_in; k++) seq[k][h[k]]='\0';
764
765             //printf("%d\t%d\t%d\tN/L/M\n", N_in, L, i); /* -------- FIXME  */
766
767             if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n";
768             L = i; //Number of match states
769
770             delete[] percent_gaps; percent_gaps = NULL;
771             break;
772
773
774             ////////////////////////////////////////////////////////////////////////
775             // Using residues of first sequence as match states
776         case 3:
777             /* Note: allocating statically is fine most of the time 
778                but when the sequences/profiles get really long 
779                we might run out of memory, so must really do it dynamically. 
780                had to move declaration of float *percent_gaps out of switch()
781             */
782             //char match_state[MAXCOL]; //1: column assigned to match state 0: insert state
783             match_state = new(char[par.maxColCnt]);
784
785             // Determine number of columns L in alignment
786             L=strlen(seq[0]+1);
787             if (v>=3) printf("Length of first seq = %i\n",L);
788             // Check for sequences with unequal lengths
789             for (k=1; k<N_in; k++)
790                 if (int(strlen(seq[k]+1))!=L) {unequal_lengths=k; break;}
791             if (unequal_lengths) break;
792
793             // Determine match states: seq kfirst has residue at pos l -> match state
794             for (l=1; l<=L; l++)
795                 if (isalpha(seq[kfirst][l])) match_state[l]=1; else match_state[l]=0;
796             // Throw out insert states and keep only match states
797             for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';}
798             i=0;
799             for (l=1; l<=L; l++)
800                 {
801                     if (match_state[l]) // does sequence 0 have residue at position l?
802                         {
803                             if (i>=/*MAXRES*/par.maxResLen-2) {
804                                 if (v>=1)
805                                     printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i);
806                                 break;
807                             }
808                             i++;
809                             this->l[i]=l;
810                             for (k=0; k<N_in; k++)
811                                 {
812                                     if (keep[k])
813                                         {
814                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
815                                             X[k][i]=aa2i(seq[k][l]);
816                                             I[k][i]=0;
817                                         }
818                                     else if (k==kss_dssp || k==kss_pred)
819                                         {
820                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
821                                             X[k][i]=ss2i(seq[k][l]);
822                                         }
823                                     else if (k==ksa_dssp)
824                                         {
825                                             seq[k][h[k]++]=MatchChr(seq[k][l]);
826                                             X[k][i]=sa2i(seq[k][l]);
827                                         }
828                                     else if (k==kss_conf)
829                                         {
830                                             seq[k][h[k]++]=seq[k][l];
831                                             X[k][i]=cf2i(seq[k][l]);
832                                         }
833                                 }
834                         }
835                     else
836                         {
837                             for (k=0; k<N_in; k++)
838                                 if (keep[k] && aa2i(seq[k][l])<GAP)
839                                     {
840                                         I[k][i]++;
841                                         seq[k][h[k]++]=InsertChr(seq[k][l]);
842                                     }
843                         }
844                 }
845             for (k=0; k<N_in; k++) seq[k][h[k]]='\0';
846
847             //Replace GAP with ENDGAP for all end gaps /* MR1 */
848             for (k=0; k<N_in; ++k)
849                 {
850                     if (!keep[k]) continue;
851                     for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1, note i++ <- ++i */
852                     for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
853                 }
854
855             if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n";
856             L = i; //Number of match states
857
858             delete[] match_state; match_state = NULL;
859             break;
860
861         } //end switch()
862     ///////////////////////////////////////////////////////////////////////////
863
864
865     // Error
866     if (unequal_lengths)
867         {
868             strcut(sname[unequal_lengths]);
869             cerr<<endl<<"Error: sequences in "<<infile<<" do not all have the same number of columns, \ne.g. first sequence and sequence "<<sname[unequal_lengths]<<".\n";
870             if(par.M==1) cerr<<".\nCheck input format for '-M a2m' option and consider using '-M first' or '-M 50'\n";
871             exit(1);
872         }
873
874     // Avert user about -cons option?
875     if (v>=2 && !par.cons)
876         {
877             for (i=1; i<=L; i++)
878                 if (X[kfirst][i]==GAP)
879                     {
880                         printf("NOTE: Use the '-cons' option to calculate a consensus sequence as first sequence of the alignment.\n");
881                         break;
882                     }
883         }
884     /* MR1
885     //Replace GAP with ENDGAP for all end gaps
886     for (k=0; k<N_in; k++)
887     {
888     if (!keep[k]) continue;
889     for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP;
890     for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP;
891     }*/
892
893     // DEBUG
894     if (v>=4)
895         for (k=0; k<N_in; k++)
896             {
897                 if (!display[k]) continue;
898                 cout<<">"<<sname[k]<<"\n";
899                 if (k==kss_dssp || k==kss_pred) {for (i=1; i<=L; i++) cout<<char(i2ss(X[k][i]));}
900                 else if (k==kss_conf) {for (i=1; i<=L; i++) cout<<char(i2cf(X[k][i]));}
901                 else if (k==ksa_dssp) {for (i=1; i<=L; i++) cout<<char(i2sa(X[k][i]));}
902                 else
903                     {
904                         for (i=1; i<=L; i++) cout<<char(i2aa(X[k][i]));
905                         cout<<"\n";
906                         for (i=1; i<=L; i++)
907                             if (I[k][i]==0) cout<<"-"; else if (I[k][i]>9) cout<<"X"; else cout<<I[k][i];
908                     }
909                 cout<<"\n";
910             }
911
912     delete[](h); h = NULL;
913 }
914
915
916 /**
917  * @brief Remove sequences with seq. identity larger than seqid percent
918  *(remove the shorter of two) or coverage<cov_thr
919  *
920  * FIXME: originally max_seqid is a variable that is the cutoff
921  *  above which sequences are thrown out. We want to throw out sequences
922  *  when building the HMM but not for display, there we want to keep all.
923  *  This should be really easy, but there is some hidden stuff going on
924  *  in this function, so I did a minimal-invasive change and just stuck
925  *  (effectively) a hard-wired 100 instead of the variable.
926  *  At a later stage we should get rid of this function alltogether
927  *  as it does gobble up some time (and is quadratic in noof sequences, I think)
928  *  FS, 2010-10-04
929  */
930 ////////////////////////////////////////////////////////////////////////////
931 /*
932  */
933 inline int 
934 Alignment::FilterForDisplay(int max_seqid, int coverage, int qid, float qsc, int N)
935 {
936
937     /* FIXME
938      * by just returning n_display and not doing anything
939      * I think we display everything and not do any work for it
940      */
941     return n_display; /* FS, 2010-10-04*/
942
943
944     if (par.mark) return n_display;
945     char *dummy = new(char[N_in+1]);
946     int vtmp=v, seqid;
947     v=0;
948     n_display=0;
949     if (kss_dssp>=0) display[kss_dssp]=KEEP_NOT;
950     if (ksa_dssp>=0) display[ksa_dssp]=KEEP_NOT;
951     if (kss_pred>=0) display[kss_pred]=KEEP_NOT;
952     if (kss_conf>=0) display[kss_conf]=KEEP_NOT;
953     for (seqid=imin(10,max_seqid); n_display<N && seqid<=max_seqid; seqid++)
954         {
955             for (int k=0; k<N_in; k++) dummy[k]=display[k];
956             n_display = Filter2(dummy,coverage,qid,qsc,20,seqid,0);
957             // printf("Seqid=%3i n_display=%4i\n",seqid,n_display);
958         }
959     if (n_display>N)
960         {
961             for (int k=0; k<N_in; k++) dummy[k]=display[k];
962             n_display = Filter2(dummy,coverage,qid,qsc,20,--(--seqid),0);
963         }
964     v=vtmp;
965     for (int k=0; k<N_in; k++) display[k]=dummy[k];
966     if (kss_dssp>=0) {display[kss_dssp]=KEEP_CONDITIONALLY; n_display++;}
967     if (ksa_dssp>=0) {display[ksa_dssp]=KEEP_CONDITIONALLY; n_display++;}
968     if (kss_pred>=0) {display[kss_pred]=KEEP_CONDITIONALLY; n_display++;}
969     if (kss_conf>=0) {display[kss_conf]=KEEP_CONDITIONALLY; n_display++;}
970     delete[] dummy; dummy = NULL;
971     return n_display;
972 }
973
974 /////////////////////////////////////////////////////////////////////////////////////
975 // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) or coverage<cov_thr
976 /////////////////////////////////////////////////////////////////////////////////////
977 inline int Alignment::Filter(int max_seqid, int coverage, int qid, float qsc, int N)
978 {
979     return Filter2(keep,coverage,qid,qsc,20,max_seqid,N);
980 }
981
982 /////////////////////////////////////////////////////////////////////////////
983 /*
984  * @brief Select set of representative sequences in the multiple sequence alignment
985  *
986  * Filter criteria:
987  * Remove sequences with coverage of query less than "coverage" percent
988  * Remove sequences with sequence identity to query of less than "qid" percent
989  * If Ndiff==0, remove sequences with seq. identity larger than seqid2(=max_seqid) percent
990  * If Ndiff>0, remove sequences with minimum-sequence-identity filter of between seqid1
991  * and seqid2 (%), where the minimum seqid threshold is determined such that,
992  * in all column blocks of at least WMIN=25 residues, at least Ndiff sequences are left.
993  * This ensures that in multi-domain proteins sequences covering one domain are not
994  * removed completely because sequences covering other domains are more diverse.
995  *
996  * Allways the shorter of two compared sequences is removed (=> sort sequences by length first).
997  * Please note: sequence identity of sequence x with y when filtering x is calculated as
998  * number of residues in sequence x that are identical to an aligned residue in y / number of residues in x
999  * Example: two sequences x and y are 100% identical in their overlapping region but one overlaps by 10% of its
1000  * length on the left and the other by 20% on the right. Then x has 10% seq.id with y and y has 20% seq.id. with x.
1001  */
1002 //////////////////////////////////////////////////////////////////////////////
1003 int 
1004 Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, int seqid2, int Ndiff)
1005 {
1006     // In the beginnning, keep[k] is 1 for all regular amino acid sequences and 0 for all others (ss_conf, ss_pred,...)
1007     // In the end, keep[k] will be 1 for all regular representative sequences kept in the alignment, 0 for all others
1008     char* in=new(char[N_in+1]); // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid
1009     char* inkk=new(char[N_in+1]); // inkk[k]=1 iff in[ksort[k]]=1 else 0;
1010     int* Nmax=new(int[L+2]); // position-dependent maximum-sequence-identity threshold for filtering /* MR1, used to be called idmax*/
1011     int* idmaxwin=new(int[L+2]); // minimum value of idmax[i-WFIL,i+WFIL]
1012     int* seqid_prev=new(int[N_in+1]); // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid)
1013     int* N=new(int[L+2]); // N[i] number of already accepted sequences at position i
1014     const int WFIL=25; // see previous line
1015
1016     int diffNmax=Ndiff;       // current  maximum difference of Nmax[i] and Ndiff /* MR1 */
1017     int diffNmax_prev=0;      // previous maximum difference of Nmax[i] and Ndiff /* MR1 */
1018
1019     int seqid; // current maximum value for the position-dependent maximum-sequence-identity thresholds in idmax[]
1020     int seqid_step=0;         // previous increment of seqid /* MR1 */
1021
1022     float diff_min_frac; // minimum fraction of differing positions between sequence j and k needed to accept sequence k
1023     float qdiff_max_frac=0.9999-0.01*qid; // maximum allowable number of residues different from query sequence
1024     int diff; // number of differing positions between sequences j and k (counted so far)
1025     int diff_suff; // number of differing positions between sequences j and k that would be sufficient
1026     int qdiff_max; // maximum number of residues required to be different from query
1027     int cov_kj; // upper limit of number of positions where both sequence k and j have a residue
1028     int first_kj; // first non-gap position in sequence j AND k
1029     int last_kj; // last non-gap position in sequence j AND k
1030     int kk, jj; // indices for sequence from 1 to N_in
1031     int k, j; // kk=ksort[k], jj=ksort[j]
1032     int i; // counts residues
1033     int n; // number of sequences accepted so far
1034
1035
1036     // Initialize in[k]
1037     for (n=k=0; k<N_in; k++) if (keep[k]==KEEP_ALWAYS) {in[k]=2/*KEEP_ALWAYS??*/; n++;} else in[k]=0;
1038
1039     // Determine first[k], last[k]?
1040     if (first==NULL)
1041         {
1042             first=new(int[N_in]);// first non-gap position in sequence k
1043             last =new(int[N_in]);// last  non-gap position in sequence k
1044             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1045                 {
1046                     for (i=1; i<=L; i++) if (X[k][i]<NAA) break;
1047                     first[k]=i;
1048                     for (i=L; i>=1; i--) if (X[k][i]<NAA) break;
1049                     last[k]=i;
1050                 }
1051         }
1052
1053     // Determine number of residues nres[k]?
1054     if ( (nres==NULL)  || (sizeof(nres)<N_in*sizeof(int)) )
1055         {
1056             nres=new(int[N_in]);
1057             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1058                 {
1059                     int nr=0;
1060                     for (i=first[k]; i<=last[k]; i++)
1061                         if (X[k][i]<NAA) nr++;
1062                     nres[k]=nr;
1063                     // printf("%20.20s nres=%3i first=%3i last=%3i\n",sname[k],nr,first[k],last[k]);
1064                 }
1065         }
1066
1067     // Sort sequences according to length; afterwards, nres[ksort[kk]] is sorted by size
1068     if (ksort==NULL)
1069         {
1070             ksort=new(int[N_in]); // never reuse alignment object for new alignment with more sequences
1071             for (k=0; k<N_in; k++) ksort[k]=k;
1072             QSortInt(nres,ksort,kfirst+1,N_in-1,-1); //Sort sequences after kfirst (query) in descending order
1073         }
1074     for (kk=0; kk<N_in; kk++) inkk[kk]=in[ksort[kk]];
1075
1076     // Initialize N[i], idmax[i], idprev[i]
1077     for (i=1; i<first[kfirst]; i++) N[i]=0;
1078     for (i=first[kfirst]; i<=last[kfirst]; i++) N[i]=1;
1079     for (i=last[kfirst]+1; i<=L; i++) N[i]=0;
1080     //for (i=1; i<=L; i++) {idmax[i]=seqid1; idmaxwin[i]=-1;}
1081     for (i=1; i<=L; ++i) {Nmax[i]=0; idmaxwin[i]=-1;} /* MR1 */
1082     for (k=0; k<N_in; k++) seqid_prev[k]=-1;
1083     if (Ndiff<=0 || Ndiff>=N_in) {seqid1=seqid2; Ndiff=N_in; diffNmax=Ndiff;}
1084
1085     // Check coverage and sim-to-query criteria for each sequence k
1086     for (k=0; k<N_in; k++)
1087         {
1088             if (keep[k]==KEEP_NOT || keep[k]==KEEP_ALWAYS) continue; // seq k not regular sequence OR is marked sequence
1089             if (100*nres[k]<coverage*L) {keep[k]=KEEP_NOT; continue;} // coverage too low? => reject once and for all
1090
1091             float qsc_sum=0.0;
1092
1093             // Check if score-per-column with query is at least qsc
1094             if (qsc>-10)
1095                 {
1096                     float qsc_min = qsc*nres[k]; // minimum total score of seq k with query
1097
1098                     int gapq=0, gapk=0; // number of consecutive gaps in query or k'th sequence at position i
1099                     for (int i=first[k]; i<=last[k]; i++)
1100                         {
1101                             if (X[k][i]<20)
1102                                 {
1103                                     gapk=0;
1104                                     if (X[kfirst][i]<20)
1105                                         {
1106                                             gapq=0;
1107                                             qsc_sum += S[(int)X[kfirst][i]][(int)X[k][i]];
1108                                         }
1109                                     else if (gapq++) qsc_sum-=PLTY_GAPEXTD; else qsc_sum-=PLTY_GAPOPEN;
1110                                 }
1111                             else if (X[kfirst][i]<20)
1112                                 {
1113                                     gapq=0;
1114                                     if (gapk++) qsc_sum-=PLTY_GAPEXTD; else qsc_sum-=PLTY_GAPOPEN;
1115                                 }
1116                         }
1117                     // printf("k=%3i qsc=%6.2f\n",k,qsc_sum);
1118                     if (qsc_sum<qsc_min) {keep[k]=KEEP_NOT; continue;} // too different from query? => reject once and for all
1119                 }
1120
1121             //Check if sequence similarity with query at least qid?
1122             if (qdiff_max_frac<0.999)
1123                 {
1124                     qdiff_max=int(qdiff_max_frac*nres[k]+0.9999);
1125                     // printf("k=%-4i nres=%-4i qdiff_max=%-4i first=%-4i last=%-4i",k,nres[k],qdiff_max,first[k],last[k]);
1126                     diff=0;
1127                     for (int i=first[k]; i<=last[k]; i++)
1128                         // enough different residues to reject based on minimum qid with query? => break
1129                         if (X[k][i]<20 && X[k][i]!=X[kfirst][i] && ++diff>=qdiff_max) break;
1130                     // printf(" diff=%4i\n",diff);
1131                     if (diff>=qdiff_max) {keep[k]=KEEP_NOT; continue;} // too different from query? => reject once and for all
1132                 }
1133             // printf(" qsc=%6.2f qid=%6.2f \n",qsc_sum/nres[k],100.0*(1.0-(float)(diff)/nres[k]));
1134         }
1135
1136     if (seqid1>seqid2)
1137         {
1138             for (n=k=0; k<N_in; k++) if (keep[k]>KEEP_NOT) n++;
1139             return n;
1140         }
1141
1142     // Successively increment idmax[i] at positons where N[i]<Ndiff
1143     //for (seqid=seqid1; seqid<=seqid2; seqid+=1+(seqid>=50)) /* MR1 */
1144     seqid=seqid1;
1145     while (seqid<=seqid2)
1146         {
1147             /*
1148             // Update idmax[i]
1149             for (i=1; i<=L; i++) if (N[i]<Ndiff) idmax[i]=seqid;
1150
1151             // Update idmaxwin[i] as minimum of idmax[i-WFIL,i+WFIL]. If idmaxwin[] has not changed then stop
1152             char stop=1;
1153             for (i=1; i<=L; i++)
1154             {
1155             int idmax_min=seqid2;
1156             for (j=imax(1,imin(L-2*WFIL+1,i-WFIL)); j<=imin(L,imax(2*WFIL,i+WFIL)); j++)
1157             if (idmax[j]<idmax_min) idmax_min=idmax[j];
1158             if (idmax_min>idmaxwin[i]) stop=0; // idmaxwin[i] has changed => do not stop
1159             idmaxwin[i]=idmax_min;
1160             }
1161             */
1162             char stop=1;
1163             // Update Nmax[i]
1164             diffNmax_prev = diffNmax;
1165             diffNmax = 0;
1166             for (i=1; i<=L; ++i)
1167                 {
1168                     int max=0;
1169                     for (j=imax(1,imin(L-2*WFIL+1,i-WFIL)); j<=imin(L,imax(2*WFIL,i+WFIL)); ++j)
1170                         if (N[j]>max) max=N[j];
1171                     if (Nmax[i]<max) Nmax[i]=max;
1172                     if (Nmax[i]<Ndiff)
1173                         {
1174                             stop=0;
1175                             idmaxwin[i]=seqid;
1176                             if (diffNmax<Ndiff-Nmax[i]) diffNmax=Ndiff-Nmax[i];
1177                         }
1178
1179                 }
1180
1181             //printf("seqid=%3i  diffNmax_prev= %-4i   diffNmax= %-4i   n=%-5i  N_in-N_ss=%-5i\n",seqid,diffNmax_prev,diffNmax,n,N_in-N_ss);
1182
1183             if (stop) break;
1184
1185             // // DEBUG
1186             // printf("idmax ");
1187             // for (i=1; i<=L; i++) printf("%2i ",idmax[i]);
1188             // printf("\n");
1189             // printf("idmaxwin ");
1190             // for (i=1; i<=L; i++) printf("%2i ",idmaxwin[i]);
1191             // printf("\n");
1192             // printf("N[i] ");
1193             // for (i=1; i<=L; i++) printf("%2i ",N[i]);
1194             // printf("\n");
1195
1196             // Loop over all candidate sequences kk (-> k)
1197             for (kk=0; kk<N_in; kk++)
1198                 {
1199                     if (inkk[kk]) continue; // seq k already accepted
1200                     k=ksort[kk];
1201                     if (!keep[k]) continue; // seq k is not regular aa sequence or already suppressed by coverage or qid criterion
1202                     if (keep[k]==KEEP_ALWAYS) {inkk[kk]=2; continue;} // accept all marked sequences (no n++, since this has been done already)
1203
1204                     // Calculate max-seq-id threshold seqidk for sequence k (as maximum over idmaxwin[i])
1205                     if (seqid>=100) {in[k]=inkk[kk]=1; n++; continue;}
1206                     float seqidk=seqid1;
1207                     for (i=first[k]; i<=last[k]; i++)
1208                         if (idmaxwin[i]>seqidk) seqidk=idmaxwin[i];
1209                     if (seqid==seqid_prev[k]) continue; // sequence has already been rejected at this seqid threshold => reject this time
1210                     seqid_prev[k]=seqid;
1211                     diff_min_frac =0.9999-0.01*seqidk; // min fraction of differing positions between sequence j and k needed to accept sequence k
1212
1213                     // Loop over already accepted sequences
1214                     for (jj=0; jj<kk; jj++)
1215                         {
1216                             if (!inkk[jj]) continue;
1217                             j=ksort[jj];
1218                             first_kj=imax(first[k],first[j]);
1219                             last_kj =imin(last[k],last[j]);
1220                             cov_kj = last_kj-first_kj+1;
1221                             diff_suff=int(diff_min_frac*imin(nres[k],cov_kj)+0.999); // nres[j]>nres[k] anyway because of sorting /* MR1 0.999 */
1222                             diff=0;
1223                             for (int i=first_kj; i<=last_kj; i++)
1224                                 {
1225                                     // enough different residues to accept? => break
1226                                     if (X[k][i]>=NAA || X[j][i]>=NAA)
1227                                         cov_kj--;
1228                                     else
1229                                         if (X[k][i]!=X[j][i] && ++diff>=diff_suff) break; // accept (k,j)
1230                                 }
1231                             // // DEBUG
1232                             // printf("%20.20s with %20.20s: diff=%i diff_min_frac*cov_kj=%f diff_suff=%i nres=%i cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj);
1233                             // printf("%s\n%s\n\n",seq[k],seq[j]);
1234
1235                             //if (float(diff)<fmin(diff_min_frac*cov_kj,diff_suff)) break; //similarity > acceptace threshold? Reject! /* MR1 */
1236                             if (diff<diff_suff && float(diff)<=diff_min_frac*cov_kj) break; //dissimilarity < acceptace threshold? Reject! /* MR1 */
1237
1238
1239                         }
1240                     if (jj>=kk) // did loop reach end? => accept k. Otherwise reject k (the shorter of the two)
1241                         {
1242                             in[k]=inkk[kk]=1;
1243                             n++;
1244                             for (i=first[k]; i<=last[k]; i++) N[i]++; // update number of sequences at position i
1245                             // printf("%i %20.20s accepted\n",k,sname[k]);
1246                         }
1247                     // else
1248                     // {
1249                     // printf("%20.20s rejected: too similar with seq %20.20s diff=%i diff_min_frac*cov_kj=%f diff_suff=%i nres=%i cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj);
1250                     // printf("%s\n%s\n\n",seq[k],seq[j]);
1251                     // }
1252
1253                 } // End Loop over all candidate sequences kk
1254
1255             // // DEBUG
1256             // printf("\n");
1257             // printf("seqid_prev[k]= \n");
1258             // for (k=0; k<N_in; k++) printf("%2i ",seqid_prev[k]);
1259             // printf("\n");
1260
1261             // Increment seqid /* MR1 */
1262             seqid_step = imax(1,imin(5,diffNmax/(diffNmax_prev-diffNmax+1)*seqid_step/2));
1263             seqid += seqid_step;
1264
1265         } // End Loop over seqid
1266
1267     if (v>=2)
1268         {
1269             printf("%i out of %i sequences passed filter (",n,N_in-N_ss);
1270             if (par.coverage)
1271                 printf("%i%% min coverage, ",coverage);
1272             if (qid)
1273                 printf("%i%% min sequence identity to query, ",qid);
1274             if (qsc>-10)
1275                 printf("%.2f bits min score per column to query, ",qsc);
1276             if (Ndiff<N_in && Ndiff>0)
1277                 printf("up to %i%% position-dependent max pairwise sequence identity)\n",seqid);
1278             else
1279                 printf("%i%% max pairwise sequence identity)\n",seqid1);
1280         }
1281
1282     for (k=0; k<N_in; k++) keep[k]=in[k];
1283     delete[] in; in = NULL;
1284     delete[] inkk; inkk = NULL;
1285     //delete[] idmax; idmax = NULL;
1286     delete[] Nmax; /* MR1 */
1287     delete[] idmaxwin; idmaxwin = NULL;
1288     delete[] seqid_prev; seqid_prev = NULL;
1289     delete[] N; N = NULL;
1290 #if 0
1291     printf("%s:%s:%d: sequences accepted = %d/%d\n", __FUNCTION__, __FILE__, __LINE__, n, N_in-N_ss);
1292 #endif
1293     return n;
1294 }
1295
1296
1297
1298 /* MR1: the Alignment::HomologyFilter is no longer needed in hhalign-stand-alone */
1299 /////////////////////////////////////////////////////////////////////////////
1300 /**
1301  * @brief Filter for min score per column coresc with core query profile,
1302  *   defined by coverage_core and qsc_core 
1303  */
1304 /////////////////////////////////////////////////////////////////////////////
1305 int 
1306 Alignment::HomologyFilter(int coverage_core, float qsc_core, float coresc)
1307 {
1308     const int seqid_core=90; //maximum sequence identity in core alignment
1309     const int qid_core=0;
1310     const int Ndiff_core=0;
1311     int n;
1312     HMM qcore;
1313     char* coreseq=new(char[N_in]); // coreseq[k]=1 if sequence belongs to core of alignment (i.e. it is very similar to query)
1314     for (int k=0; k<N_in; k++) coreseq[k]=keep[k]; // Copy keep[] into coreseq[]
1315
1316     // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
1317     int v1=v; v=1;
1318     n = Filter2(coreseq,coverage_core,qid_core,qsc_core,seqid_core,seqid_core,Ndiff_core);
1319     v=v1;
1320     if (v>=2)
1321         {
1322             printf("%i out of %i core alignment sequences passed filter (",n,N_in-N_ss);
1323             if (par.coverage_core)
1324                 printf("%i%% min coverage, ",coverage_core);
1325             if (qid_core)
1326                 printf("%i%% min sequence identity to query, ",qid_core);
1327             if (qsc_core>-10)
1328                 printf("%.2f bits min score per column to query, ",qsc_core);
1329             printf("%i%% max pairwise sequence identity)\n",seqid_core);
1330         }
1331
1332     // Calculate bare AA frequencies and transition probabilities -> qcore.f[i][a], qcore.tr[i][a]
1333     FrequenciesAndTransitions(qcore,coreseq);
1334
1335     // Add transition pseudocounts to query -> q.p[i][a] (gapd=1.0, gape=0.333, gapf=gapg=1.0, gaph=gapi=1.0, gapb=1.0
1336     qcore.AddTransitionPseudocounts(1.0,0.333,1.0,1.0,1.0,1.0,1.0);
1337
1338     // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1339     qcore.PreparePseudocounts();
1340
1341     // Add amino acid pseudocounts to query: qcore.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1342     qcore.AddAminoAcidPseudocounts(2,1.5,2.0,1.0); // pcm=2, pca=1.0, pcb=2.5, pcc=1.0
1343
1344     // Filter out all sequences below min score per column with qcore
1345     n=FilterWithCoreHMM(keep, coresc, qcore);
1346
1347     if (v>=2) cout<<n<<" out of "<<N_in-N_ss<<" sequences filtered by minimum score-per-column threshold of "<<qsc_core<<"\n";
1348     delete[] coreseq; coreseq = NULL;
1349     return n;
1350 }
1351
1352
1353 /////////////////////////////////////////////////////////////////////////////////////
1354 /**
1355  * @brief Filter out all sequences below a minimum score per column with profile qcore
1356  */
1357 int 
1358 Alignment::FilterWithCoreHMM(char in[], float coresc, HMM& qcore)
1359 {
1360     int k; // count sequences in alignment
1361     int i; // column in query alignment
1362     int a; // amino acid (0..19)
1363     int n=1; // number of sequences that passed filter
1364     float** logodds=new(float*[L+1]); // log-odds ratios for HMM qcore
1365     char gap; // 1: previous state in seq k was a gap 0: previous state in seq k was an amino acid
1366     float score; // score of sequence k aligned with qcore
1367
1368     for (i=1; i<=L; i++) logodds[i]=new(float[21]);
1369
1370     // Determine first[k], last[k]?
1371     if (first==NULL)
1372         {
1373             first=new(int[N_in]);// first non-gap position in sequence k
1374             last =new(int[N_in]);// last non-gap position in sequence k
1375             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1376                 {
1377                     for (i=1; i<=L; i++) if (X[k][i]<NAA) break;
1378                     first[k]=i;
1379                     for (i=L; i>=1; i--) if (X[k][i]<NAA) break;
1380                     last[k]=i;
1381                 }
1382         }
1383
1384     // Determine number of residues nres[k]?
1385     if (nres==NULL)
1386         {
1387             nres=new(int[N_in]);
1388             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1389                 {
1390                     int nr=0;
1391                     for (i=first[k]; i<=last[k]; i++)
1392                         if (X[k][i]<NAA) nr++;
1393                     nres[k]=nr;
1394                     // printf("%20.20s nres=%3i first=%3i last=%3i\n",sname[k],nr,f,l);
1395                 }
1396         }
1397
1398     // Precalculate the log-odds for qcore
1399     for (i=1; i<=L; i++)
1400         {
1401             for (a=0; a<NAA; a++)
1402                 logodds[i][a]=fast_log2(qcore.p[i][a]/pb[a]);
1403             logodds[i][ANY]=-0.5; // half a bit penalty for X
1404
1405             // printf(" A R N D C Q E G H I L K M F P S T W Y V\n");
1406             // printf("%6i ",i);
1407             // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*qcore.f[i][a]);
1408             // printf("\n");
1409             // printf(" ");
1410             // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*qcore.g[i][a]);
1411             // printf("\n");
1412             // printf(" ");
1413             // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*qcore.p[i][a]);
1414             // printf("\n");
1415             // printf(" ");
1416             // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*pb[a]);
1417             // printf("\n");
1418             // printf(" ");
1419             // for (a=0; a<20; ++a) fprintf(stdout,"%5.2f ",fast_log2(qcore.p[i][a]/pb[a]));
1420             // printf("\n");
1421         }
1422
1423     // Main loop: test all sequences k
1424     for (k=kfirst+1; k<N_in; k++)
1425         {
1426             if (!in[k]) continue; // if in[k]==0 sequence k will be suppressed directly
1427
1428             float score_M=0.0;
1429             float score_prev=0.0;
1430
1431             // Calculate score of sequence k with core HMM
1432             score=0; gap=0;
1433             for (i=first[k]; i<=last[k]; i++)
1434                 {
1435                     score_M=0.0;
1436                     if (X[k][i]<=ANY) // current state is Match
1437                         {
1438                             score_M=logodds[i][ (int)X[k][i]];
1439                             score+=logodds[i][ (int)X[k][i]];
1440                             if (gap) score+=qcore.tr[i][D2M]; else score+=qcore.tr[i][M2M];
1441                             gap=0;
1442                         }
1443                     else if (X[k][i]==GAP) // current state is Delete (ignore ENDGAPs)
1444                         {
1445                             if (gap) score+=qcore.tr[i][D2D]; else score+=qcore.tr[i][M2D];
1446                             gap=1;
1447                         }
1448                     if (I[k][i]) score+=qcore.tr[i][M2I]+(I[k][i]-1)*qcore.tr[i][I2I]+qcore.tr[i][I2M];
1449                     // if (k==2) printf("i=%3i %c:%c score_M=%6.2f score=%6.2f score_sum=%6.2f \n",i,i2aa(X[kfirst][i]),i2aa(X[k][i]),score_M,score-score_prev,score);
1450                     score_prev=score;
1451                 }
1452
1453             printf("k=%3i score=%6.2f\n",k,score);
1454             if (score<nres[k]*coresc) in[k]=0; else n++;// reject sequence k?
1455         }
1456     for (i=1; i<=L; i++){
1457         delete[] logodds[i]; logodds[i] = NULL;
1458     }
1459     delete[] logodds; logodds = NULL;
1460     return n;
1461 }
1462
1463
1464 /* MR1 */
1465 #if 0
1466 /////////////////////////////////////////////////////////////////////////////////////
1467 /**
1468  * @brief Filter alignment to given diversity/Neff
1469  */
1470 bool 
1471 Alignment::FilterNeff()
1472 {
1473     int v1=v;
1474     v=v1-1;
1475     const float TOLX=0.001;
1476     const float TOLY=0.02;
1477     char dummy[N_in+1];
1478     for (int k=0; k<N_in; ++k) dummy[k]=keep[k];
1479     float x=0.0,y=0.0;
1480     float x0=-1.0;
1481     float x1=+2.0;
1482     float y0=filter_by_qsc(x0,dummy);
1483     float y1=filter_by_qsc(x1,dummy);
1484     int i=2;
1485     while (y0-par.Neff>0 && par.Neff-y1>0)
1486         {
1487             x = x0 + (par.Neff-y0)*(x1-x0)/(y1-y0); // linear interpolation between (x0,y0) and (x1,y1)
1488             y = filter_by_qsc(x,dummy);
1489             if (v>=2) printf(" %3i  x0=%6.3f -> %6.3f     x=%6.3f -> %6.3f     x1=%6.3f -> %6.3f \n",++i,x0,y0,x,y,x1,y1);
1490             if (y>par.Neff) {x0=x; y0=y;} else {x1=x; y1=y;}
1491             if (fabs(par.Neff-y)<TOLY || x1-x0<TOLX) break;
1492         }
1493     v=v1;
1494
1495     if (y0>=par.Neff && y1<=par.Neff)
1496         {
1497             // Write filtered alignment WITH insert states (lower case) to alignment file
1498             if (v>=2) printf("Found Neff=%6.3f at filter threshold qsc=%6.3f\n",y,x);
1499             return true;
1500         }
1501     else if (v>=1)
1502         printf("Diversity of unfiltered alignment %.2f is below target diversity %.2f. No alignment written\n",y0,par.Neff);
1503
1504     return false;
1505 }
1506
1507 float Alignment::filter_by_qsc(float qsc, char* dummy)
1508 {
1509     HMM q;
1510     for (int k=0; k<N_in; ++k) keep[k]=dummy[k];
1511     Filter2(keep,par.coverage,0,qsc,par.max_seqid+1,par.max_seqid,0);
1512     FrequenciesAndTransitions(q);
1513     //   printf("qsc=%4.1f  N_filtered=%-3i  Neff=%6.3f\n",qsc,n,q.Neff_HMM);
1514     return q.Neff_HMM;
1515 }
1516 #endif
1517
1518 /////////////////////////////////////////////////////////////////////////////////////
1519 /**
1520  * @brief  Calculate AA frequencies q.p[i][a] and transition probabilities q.tr[i][a] from alignment
1521  */
1522 void 
1523 Alignment::FrequenciesAndTransitions(HMM& q, char* in)
1524 {
1525     int k; // index of sequence
1526     int i; // position in alignment
1527     int a; // amino acid (0..19)
1528     int ni[NAA+3]; // number of times amino acid a occurs at position i
1529     int naa; // number of different amino acids
1530
1531     if (v>=3)
1532         cout<<"Calculating position-dependent weights on subalignments\n";
1533
1534     if (in==NULL) in=keep; // what's this good for?
1535
1536     if (N_filtered>1)
1537         {
1538             for (k=0; k<N_in; k++) wg[k]=0.0; // initialized wg[k]
1539             // Calculate global weights
1540             for (i=1; i<=L; i++) // for all positions i in alignment
1541                 {
1542                     for (a=0; a<20; a++) ni[a]=0;
1543                     for (k=0; k<N_in; k++) if (in[k]) ni[ (int)X[k][i]]++;
1544                     naa=0; for (a=0; a<20; a++) if(ni[a]) naa++;
1545                     if (!naa) naa=1; //naa=0 when column consists of only gaps and Xs (=ANY)
1546                     for (k=0; k<N_in; k++)
1547                         if (in[k] && X[k][i]<20)
1548                             wg[k] += 1.0/float(ni[ (int)X[k][i]]*naa*(nres[k]+30));
1549                     // ensure that each residue of a short sequence contributes as much as a residue of a long sequence:
1550                     // contribution is proportional to one over sequence length nres[k] plus 30.
1551                 }
1552             NormalizeTo1(wg,N_in);
1553
1554
1555             // Do pos-specific sequence weighting and calculate amino acid frequencies and transitions
1556             for (k=0; k<N_in; k++) X[k][0]=ENDGAP; // make sure that sequences ENTER subalignment j for j=1
1557             for (k=0; k<N_in; k++) X[k][L+1]=ENDGAP; // does it have an influence?
1558
1559 #ifdef HAVE_OPENMP
1560             if(par.wg != 1)
1561             {
1562                 #pragma omp parallel sections
1563                 {
1564                     #pragma omp section
1565                     Amino_acid_frequencies_and_transitions_from_M_state(q,in); // use subalignments of seqs with residue in i
1566                     #pragma omp section
1567                     Transitions_from_I_state(q,in); // use subalignments of seqs with insert in i
1568                     #pragma omp section
1569                     Transitions_from_D_state(q,in); // use subalignments of seqs with delete in i. Must be last of these three calls if par.wg==1!
1570                 }
1571             }
1572             else
1573             {
1574                 #pragma omp parallel sections
1575                 {
1576                     #pragma omp section
1577                     Amino_acid_frequencies_and_transitions_from_M_state(q,in); // use subalignments of seqs with residue in i
1578                     #pragma omp section
1579                     Transitions_from_I_state(q,in); // use subalignments of seqs with insert in i
1580                 }
1581                 Transitions_from_D_state(q,in); // use subalignments of seqs with delete in i. Must be last of these three calls if par.wg==1!
1582             }
1583 #else
1584             Amino_acid_frequencies_and_transitions_from_M_state(q,in);
1585             Transitions_from_I_state(q,in);
1586             Transitions_from_D_state(q,in);
1587 #endif
1588         }
1589     else // N_filtered==1
1590         {
1591             X[kfirst][0]=X[kfirst][L+1]=ANY; // (to avoid anallowed access within loop)
1592             q.Neff_HMM=1.0f;
1593             for (i=0; i<=L+1; i++) // for all positions i in alignment
1594                 {
1595                     q.Neff_M[i]=1.0f;
1596                     q.Neff_I[i]=q.Neff_D[i]=0.0f;
1597                     for (a=0; a<20; a++) q.f[i][a]=0.0;
1598                     /* this is the crucial change that makes terminal-X work */
1599                     //q.f[i][ (int)(X[kfirst][i]) ] = 1.0; /* MR1 */
1600                     if (X[kfirst][i] < ANY) /* MR1 */
1601                         q.f[i][(unsigned int) X[kfirst][i] ] = 1.0;
1602                     else
1603                         for (a=0; a<20; ++a) q.f[i][a]=pb[a];
1604                     q.tr[i][M2M]=0;
1605                     q.tr[i][M2I]=-100000.0;
1606                     q.tr[i][M2D]=-100000.0;
1607                     q.tr[i][I2M]=-100000.0;
1608                     q.tr[i][I2I]=-100000.0;
1609                     q.tr[i][D2M]=-100000.0;
1610                     q.tr[i][D2D]=-100000.0;
1611                 }
1612             q.tr[0][I2M]=0;
1613             q.tr[L][I2M]=0;
1614             q.tr[0][D2M]=0;
1615             q.Neff_M[0]=q.Neff_I[0]=q.Neff_D[0]=99.999; // Neff_av[0] is used for calculation of transition pseudocounts for the start state
1616         }
1617
1618     if (v>=3)
1619         {
1620             printf("\nMatches:\n");
1621             printf("col Neff nseqs\n");
1622             for (i=1; i<=imin(L,100); i++)
1623                 printf("%3i %5.2f %3i\n",i,q.Neff_M[i],nseqs[i]);
1624
1625             printf("\nInserts:\n");
1626             printf("col Neff nseqs\n");
1627             for (i=1; i<=imin(L,100); i++)
1628                 printf("%3i %5.2f %3i\n",i,q.Neff_I[i],nseqs[i]);
1629
1630             printf("\nDeletes:\n");
1631             printf("col Neff nseqs\n");
1632             for (i=1; i<=imin(L,100); i++)
1633                 printf("%3i %5.2f %3i\n",i,q.Neff_D[i],nseqs[i]);
1634         }
1635
1636     // Copy column information into HMM q
1637     q.L=L;
1638     q.N_in=N_in;
1639     q.N_filtered=N_filtered;
1640     for (i=1; i<=L; i++) q.l[i]=l[i];
1641
1642     // Set names in HMM q
1643     if (strlen(q.name)==0) strcpy(q.name,name);
1644     if (strlen(q.longname)==0) strcpy(q.longname,longname);
1645     if (strlen(q.fam)==0) strcpy(q.fam,fam);
1646     ScopID(q.cl,q.fold,q.sfam,q.fam); // derive superfamily, fold and class code from family name
1647     strcpy(q.file,file); // Store basename of alignment file name in q.file
1648
1649     // Copy sequences to be displayed into HMM
1650     q.nss_dssp=q.nsa_dssp=q.nss_pred=q.nss_conf=q.nfirst=-1;
1651     int n=0;
1652     if (kss_dssp>=0) q.nss_dssp=n++; // copy dssp sequence?
1653     if (ksa_dssp>=0) q.nsa_dssp=n++; // copy dssp sequence?
1654     if (kss_pred>=0) q.nss_pred=n++; // copy psipred sequence?
1655     if (kss_conf>=0) q.nss_conf=n++; // copy confidence value sequence?
1656
1657     // Calculate consensus sequence?
1658     if (par.showcons || par.cons)
1659         {
1660             float maxw;
1661             int maxa;
1662             if (par.showcons)
1663                 {
1664                     // Reserve space for consensus/conservation sequence as Q-T alignment mark-up
1665                     q.ncons=n++;
1666                     q.sname[q.ncons]=new(char[10]);
1667                     if (!q.sname[q.ncons]) {MemoryError("array of names for displayed sequences");}
1668                     strcpy(q.sname[q.ncons],"Consensus");
1669                     q.seq[q.ncons]=new(char[L+2]);
1670                     if (!q.seq[q.ncons]) {MemoryError("array of names for displayed sequences");}
1671                 }
1672             if (par.cons)
1673                 {
1674                     // Reserve space for consensus sequence as first sequence in alignment
1675                     q.nfirst=n++; kfirst=-1;
1676                     q.sname[q.nfirst]=new(char[strlen(name)+11]);
1677                     if (!q.sname[q.nfirst]) {MemoryError("array of names for displayed sequences");}
1678                     strcpy(q.sname[q.nfirst],name);
1679                     strcat(q.sname[q.nfirst],"_consensus");
1680                     q.seq[q.nfirst]=new(char[L+2]);
1681                     if (!q.seq[q.nfirst]) {MemoryError("array of names for displayed sequences");}
1682                 }
1683             // Calculate consensus amino acids using similarity matrix
1684             for (i=1; i<=L; i++)
1685                 {
1686                     maxw=0.0; maxa=0;
1687                     for (a=0; a<20; a++)
1688                         if (q.f[i][a]-pb[a]>maxw) {maxw = q.f[i][a]-pb[a]; maxa = a;}
1689
1690                     if (par.showcons)
1691                         {
1692                             maxw =0.0;
1693                             for (int b=0; b<20; b++) maxw += q.f[i][b]*Sim[maxa][b]*Sim[maxa][b];
1694                             maxw *= q.Neff_M[i]/(q.Neff_HMM+1); // columns with many gaps don't get consensus symbol
1695                             if (maxw>0.6) q.seq[q.ncons][i] = uprchr(i2aa(maxa));
1696                             else if (maxw>0.4) q.seq[q.ncons][i] = lwrchr(i2aa(maxa));
1697                             else q.seq[q.ncons][i] = 'x';
1698                         }
1699                     if (par.cons) q.seq[q.nfirst][i] = uprchr(i2aa(maxa));
1700                 }
1701             if (par.showcons)
1702                 {
1703                     q.seq[q.ncons][0]='-';
1704                     q.seq[q.ncons][L+1]='\0';
1705                 }
1706             if (par.cons)
1707                 {
1708                     q.seq[q.nfirst][0]='-';
1709                     q.seq[q.nfirst][L+1]='\0';
1710                 }
1711         }
1712
1713     // Copy sequences to be displayed from alignment to HMM
1714     for (k=0; k<N_in; k++)
1715         {
1716             int nn;
1717             if (display[k])
1718                 {
1719                     if (0 && (n>=MAXSEQDIS)) {
1720                         /* FIXME: the test was if(n>=MAXSEQDIS),
1721                            this test was necessary because alignment memory was static,
1722                            now it should be dynamic, and should always have the right size,
1723                            there are at least number-of-sequences plus a 'bit' more
1724                            however, I do not know what that 'bit' is likely to be (in the future).
1725                            at the moment it is 1 for the consnseus and 1 for structure,
1726                            but this might change (FS)
1727                          */
1728                         if (par.mark) cerr<<"WARNING: maximum number "<<MAXSEQDIS<<" of sequences for display of alignment exceeded\n";
1729                         break;
1730                     }
1731                     if (k==kss_dssp) nn=q.nss_dssp; // copy dssp sequence to nss_dssp
1732                     else if (k==ksa_dssp) nn=q.nsa_dssp;
1733                     else if (k==kss_pred) nn=q.nss_pred;
1734                     else if (k==kss_conf) nn=q.nss_conf;
1735                     else if (k==kfirst) nn=q.nfirst=n++;
1736                     else nn=n++;
1737                     // strcut(sname[k]," "); // delete rest of name line beginning with two spaces " " // Why this?? Problem for pdb seqs without chain
1738                     q.sname[nn]=new(char[strlen(sname[k])+1]);
1739                     if (!q.sname[nn]) {MemoryError("array of names for displayed sequences");}
1740                     strcpy(q.sname[nn],sname[k]);
1741                     q.seq[nn]=new(char[strlen(seq[k])+1]);
1742                     if (!q.seq[nn]) {MemoryError("array of names for displayed sequences");}
1743                     strcpy(q.seq[nn],seq[k]);
1744                 }
1745         }
1746     q.n_display=n; // how many sequences to be displayed in alignments?
1747
1748     // Copy secondary structure information into HMM
1749     if (kss_dssp>=0)
1750         for (i=1; i<=L; i++) q.ss_dssp[i]=X[kss_dssp][i];
1751     if (ksa_dssp>=0)
1752         for (i=1; i<=L; i++) q.sa_dssp[i]=X[ksa_dssp][i];
1753     if (kss_pred>=0)
1754         {
1755             for (i=1; i<=L; i++) q.ss_pred[i]=X[kss_pred][i];
1756             if (kss_conf>=0)
1757                 for (i=1; i<=L; i++) q.ss_conf[i]=X[kss_conf][i];
1758             else
1759                 for (i=1; i<=L; i++) q.ss_conf[i]=5;
1760         }
1761
1762     q.lamda=0.0;
1763     q.mu=0.0;
1764
1765     // Debug: print occurence of amino acids for each position i
1766     if (v>=2) printf("Effective number of sequences exp(entropy) = %-4.1f\n",q.Neff_HMM); //PRINT
1767     if (v>=3)
1768         {
1769             cout<<"\nMatr: ";
1770             for (a=0; a<20; a++) printf("%4.1f ",100*pb[a]);
1771             cout<<"\nAmino acid frequencies without pseudocounts:\n";
1772             cout<<" A R N D C Q E G H I L K M F P S T W Y V\n";
1773             for (i=1; i<=L; i++)
1774                 {
1775                     printf("%3i: ",i);
1776                     for (a=0; a<20; a++) printf("%4.0f ",100*q.f[i][a]);
1777                     cout<<endl;
1778                 }
1779             cout<<"\n";
1780
1781             printf("\nListing transition probabilities without pseudocounts:\n");
1782             printf(" i M->M M->I M->D I->M I->I D->M D->D Neff_M Neff_I Neff_D\n");
1783             for (i=0; i<=L; i++)
1784                 {
1785                     printf("%4i %6.3f %6.3f %6.3f ",i,pow(2.0,q.tr[i][M2M]),pow(2.0,q.tr[i][M2I]),pow(2.0,q.tr[i][M2D]));
1786                     printf("%6.3f %6.3f ",pow(2.0,q.tr[i][I2M]),pow(2.0,q.tr[i][I2I]));
1787                     printf("%6.3f %6.3f ",pow(2.0,q.tr[i][D2M]),pow(2.0,q.tr[i][D2D]));
1788                     printf("%6.3f %6.3f %6.3f\n",q.Neff_M[i],q.Neff_I[i],q.Neff_D[i]);
1789                 }
1790         }
1791     q.trans_lin=0;
1792     q.has_pseudocounts=false; /* MR1 */
1793
1794     return;
1795 }
1796
1797
1798 /////////////////////////////////////////////////////////////////////////////////////
1799 /*
1800  * FIXME: one of the most time consuming routines (according to gprof on r112)
1801  */
1802 /**
1803  * @brief Calculate freqs q.f[i][a] and transitions q.tr[i][a] (a=MM,MI,MD) with pos-specific subalignments
1804  * Pos-specific weights are calculated like in "GetPositionSpecificWeights()"
1805  */
1806 void 
1807 Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in)
1808 {
1809   // Calculate position-dependent weights wi[k] for each i.
1810   // For calculation of weights in column i use sub-alignment
1811   // over sequences which have a *residue* in column i (no gap, no end gap)
1812   // and over columns where none of these sequences has an end gap.
1813   // This is done by updating the arrays n[j][a] at each step i-1->i while letting i run from 1 to L.
1814   // n[j][a] = number of occurences of amino acid a at column j of the subalignment,
1815   // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
1816   // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
1817   // then the old values wi[k], Neff[i-1], and ncol are used for the new position i.
1818   // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
1819
1820   int k; // index of sequence
1821   int i,j; // position in alignment
1822   int a; // amino acid (0..19)
1823   int naa; // number of different amino acids
1824   int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
1825   //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
1826   float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i
1827   //float Neff[MAXRES]; // diversity of subalignment i
1828   float *Neff = new(float[par.maxResLen]); // diversity of subalignment i
1829   int nseqi=0; // number of sequences in subalignment i
1830   int ncol=0; // number of columns j that contribute to Neff[i]
1831   char change; // has the set of sequences in subalignment changed? 0:no 1:yes
1832   float fj[NAA+3]; // to calculate entropy
1833   float sum;
1834
1835   wi = new(float[N_in+2]);
1836
1837   // Global weights?
1838   if (par.wg==1)
1839     for (k=0; k<N_in; k++) wi[k]=wg[k];
1840
1841   // Initialization
1842   q.Neff_HMM=0.0f;
1843   Neff[0]=0.0; // if the first column has no residues (i.e. change==0), Neff[i]=Neff[i-1]=Neff[0]
1844   n = new(int*[L+2]);
1845   for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
1846   for (j=1; j<=L; j++)
1847     for (a=0; a<NAA+3; a++) n[j][a]=0;
1848
1849
1850   //////////////////////////////////////////////////////////////////////////////////////////////
1851   // Main loop through alignment columns
1852   for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
1853     {
1854
1855       if (par.wg==0)
1856         {
1857
1858           change=0;
1859           // Check all sequences k and update n[j][a] and ri[j] if necessary
1860           for (k=0; k<N_in; k++)
1861             {
1862               if (!in[k]) continue;
1863               if (X[k][i-1]>=ANY && X[k][i]<ANY)
1864                 { // ... if sequence k was NOT included in i-1 and has to be included for column i
1865                   change=1;
1866                   nseqi++;
1867                   for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
1868                 }
1869               else if (X[k][i-1]<ANY && X[k][i]>=ANY)
1870                 { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
1871                   change=1;
1872                   nseqi--;
1873                   for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]--;
1874                 }
1875             } //end for (k)
1876           nseqs[i]=nseqi;
1877
1878           // If subalignment changed: update weights wi[k] and Neff[i]
1879           if (change)
1880             {
1881               // Initialize weights and numbers of residues for subalignment i
1882               ncol=0;
1883               for (k=0; k<N_in; k++) wi[k]=1E-8; // for pathological alignments all wi[k] can get 0; /* MR1 */
1884
1885               // sum wi[k] over all columns j and sequences k of subalignment
1886               for (j=1; j<=L; j++)
1887                 {
1888                   // do at least a fraction MAXENDGAPFRAC of sequences in subalignment contain an end gap in j?
1889                   if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
1890                   naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
1891                   if (naa==0) continue;
1892                   ncol++;
1893                   for (k=0; k<N_in; k++)
1894                     {
1895                       if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
1896                         {
1897                           // if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Mi=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
1898                           wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
1899                         }
1900                     }
1901                 }
1902
1903               // Check whether number of columns in subalignment is sufficient
1904               if (ncol<NCOLMIN)
1905                 // Take global weights
1906                 for (k=0; k<N_in; k++)
1907                   if(in[k] && X[k][i]<ANY) wi[k]=wg[k]; else wi[k]=0.0;
1908
1909               // Calculate Neff[i]
1910               Neff[i]=0.0;
1911               for (j=1; j<=L; j++)
1912                 {
1913                   // do at least a fraction MAXENDGAPFRA of sequences in subalignment contain an end gap in j?
1914                   if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
1915                   for (a=0; a<20; a++) fj[a]=0;
1916                   for (k=0; k<N_in; k++)
1917                     if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
1918                       fj[ (int)X[k][j] ]+=wi[k];
1919                   NormalizeTo1(fj,NAA);
1920                   for (a=0; a<20; a++)
1921                     if (fj[a]>1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]);
1922                 }
1923               if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
1924
1925             }
1926
1927           else //no update was necessary; copy values for i-1
1928             {
1929               Neff[i]=Neff[i-1];
1930             }
1931         }
1932
1933
1934       // Calculate amino acid frequencies q.f[i][a] from weights wi[k]
1935       for (a=0; a<20; a++) q.f[i][a]=0;
1936       for (k=0; k<N_in; k++) if (in[k]) q.f[i][ (int)X[k][i] ]+=wi[k];
1937       NormalizeTo1(q.f[i],NAA,pb);
1938
1939       // Calculate transition probabilities from M state
1940       q.tr[i][M2M]=q.tr[i][M2D]=q.tr[i][M2I]=0.0;
1941       for (k=0; k<N_in; k++) //for all sequences
1942         {
1943           if (!in[k]) continue;
1944           //if input alignment is local ignore transitions from and to end gaps
1945           if (X[k][i]<ANY) //current state is M
1946             {
1947               if (I[k][i]) //next state is I
1948                 q.tr[i][M2I]+=wi[k];
1949               else if (X[k][i+1]<=ANY) //next state is M
1950                 q.tr[i][M2M]+=wi[k];
1951               else if (X[k][i+1]==GAP) //next state is D
1952                 q.tr[i][M2D]+=wi[k];
1953             }
1954         } // end for(k)
1955       // Normalize and take log
1956       sum = q.tr[i][M2M]+q.tr[i][M2I]+q.tr[i][M2D]+FLT_MIN;
1957       q.tr[i][M2M]=log2(q.tr[i][M2M]/sum);
1958       q.tr[i][M2I]=log2(q.tr[i][M2I]/sum);
1959       q.tr[i][M2D]=log2(q.tr[i][M2D]/sum);
1960
1961       // for (k=0; k<N_in; k++) if (in[k]) w[k][i]=wi[k];
1962     }
1963     // DD TODO:fill in all the missing Neff values
1964
1965
1966   // end loop through alignment columns i
1967   //////////////////////////////////////////////////////////////////////////////////////////////
1968
1969   delete[](wi); wi=NULL;
1970   // delete n[][]
1971   for (j=1; j<=L; j++){
1972     delete[](n[j]); (n[j]) = NULL;
1973   }
1974   delete[](n); (n) = NULL;
1975
1976   q.tr[0][M2M]=0;
1977   q.tr[0][M2I]=-100000;
1978   q.tr[0][M2D]=-100000;
1979   q.tr[L][M2M]=0;
1980   q.tr[L][M2I]=-100000; 
1981   q.tr[L][M2D]=-100000;
1982   q.Neff_M[0]=99.999; // Neff_av[0] is used for calculation of transition pseudocounts for the start state
1983
1984   // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1985   for (a=0; a<20; a++) q.f[0][a]=q.f[L+1][a]=pb[a];
1986
1987   // Assign Neff_M[i] and calculate average over alignment, Neff_M[0]
1988   if (par.wg==1)
1989     {
1990       for (i=1; i<=L; i++)
1991         {
1992           float sum=0.0f;
1993           for (a=0; a<20; a++)
1994             if (q.f[i][a]>1E-10) sum -= q.f[i][a]*fast_log2(q.f[i][a]);
1995           q.Neff_HMM+=pow(2.0,sum);
1996         }
1997       q.Neff_HMM/=L;
1998       float Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
1999       float scale=log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with inserts at specific pos
2000       for (i=1; i<=L; i++)
2001         {
2002           float w_M=-1.0/N_filtered;
2003           for (k=0; k<N_in; k++)
2004             if (in[k] && X[k][i]<=ANY) w_M+=wg[k];
2005           if (w_M<0) q.Neff_M[i]=1.0;
2006           else q.Neff_M[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_M);
2007           // fprintf(stderr,"M i=%3i ncol=--- Neff_M=%5.2f Nlim=%5.2f w_M=%5.3f Neff_M=%5.2f\n",i,q.Neff_HMM,Nlim,w_M,q.Neff_M[i]);
2008         }
2009     }
2010   else
2011     {
2012       for (i=1; i<=L; i++)
2013         {
2014           q.Neff_HMM+=Neff[i];
2015           q.Neff_M[i]=Neff[i];
2016       if (q.Neff_M[i] == 0) { q.Neff_M[i] = 1; } /* MR1 */
2017         }
2018       q.Neff_HMM/=L;
2019     }
2020
2021   delete[] Neff; Neff = NULL;
2022
2023   return;
2024
2025 } /* this is the end of Alignment::Amino_acid_frequencies_and_transitions_from_M_state() */
2026
2027
2028 /////////////////////////////////////////////////////////////////////////////////////
2029 /**
2030  * @brief Calculate transitions q.tr[i][a] (a=DM,DD) with pos-specific subalignments
2031  */
2032 void 
2033 Alignment::Transitions_from_I_state(HMM& q, char* in)
2034 {
2035     // Calculate position-dependent weights wi[k] for each i.
2036     // For calculation of weights in column i use sub-alignment
2037     // over sequences which have a INSERT in column i
2038     // and over columns where none of these sequences has an end gap.
2039     // This is done by calculating the arrays n[j][a] and rj[j] at each step i-1->i while letting i run from 1 to L.
2040     // n[j][a] = number of occurences of amino acid a at column j of the subalignment,
2041     // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
2042     // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
2043     // then the old values wi[k], Neff[i-1], and ncol are used for the new position i.
2044     // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
2045
2046     int k; // index of sequence
2047     int i,j; // position in alignment
2048     int a; // amino acid (0..19)
2049     int naa; // number of different amino acids
2050     int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
2051     //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
2052     float *wi = NULL; // weight of sequence k in column i, calculated from subalignment i
2053     //float Neff[MAXRES]; // diversity of subalignment i
2054     float *Neff = new(float[par.maxResLen]); // diversity of subalignment i
2055     int nseqi; // number of sequences in subalignment i
2056     int ncol; // number of columns j that contribute to Neff[i]
2057     float fj[NAA+3]; // to calculate entropy
2058     float sum;
2059     float Nlim=0.0; // only for global weights
2060     float scale=0.0; // only for global weights
2061
2062     wi = new(float[N_in+2]);
2063
2064     // Global weights?
2065     if (par.wg==1)
2066         {
2067             for (k=0; k<N_in; k++) wi[k]=wg[k];
2068             Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
2069             scale=log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with inserts at specific pos
2070         }
2071
2072     // Initialization
2073     n = new(int*[L+2]);
2074     for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2075
2076     //////////////////////////////////////////////////////////////////////////////////////////////
2077     // Main loop through alignment columns
2078     for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
2079         {
2080             if (par.wg==0) // local weights?
2081                 {
2082
2083                     // Calculate n[j][a] and ri[j]
2084                     nseqi=0;
2085                     for (k=0; k<N_in; k++)
2086                         {
2087                             if (in[k] && I[k][i]>0)
2088                                 {
2089                                     if (nseqi==0) // Initialize only if inserts present! Otherwise O(L*L) even for single sequences!
2090                                         {
2091                                             // Initialization of n[j][a]
2092                                             for (j=1; j<=L; j++)
2093                                                 for (a=0; a<NAA+3; a++) n[j][a]=0;
2094                                         }
2095                                     nseqi++;
2096                                     for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
2097                                 }
2098                         } //end for (k)
2099                     nseqs[i]=nseqi;
2100
2101                     // If there is no sequence in subalignment j ...
2102                     if (nseqi==0)
2103                         {
2104                             ncol=0;
2105                             Neff[i]=0.0; // effective number of sequence = 0!
2106                             q.tr[i][I2M]=-100000;
2107                             q.tr[i][I2I]=-100000;
2108                             continue;
2109                         }
2110
2111                     // update weights wi[k] and Neff[i]
2112                     // if (1)
2113                     {
2114                         // Initialize weights and numbers of residues for subalignment i
2115                         ncol=0;
2116                         for (k=0; k<N_in; k++) wi[k]=0.0;
2117
2118                         // sum wi[k] over all columns j and sequences k of subalignment
2119                         for (j=1; j<=L; j++)
2120                             {
2121                                 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2122                                 naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
2123                                 if (naa==0) continue;
2124                                 ncol++;
2125                                 for (k=0; k<N_in; k++)
2126                                     {
2127                                         if (in[k] && I[k][i]>0 && X[k][j]<ANY)
2128                                             {
2129                                                 if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Ii=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
2130                                                 wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
2131                                             }
2132                                     }
2133                             }
2134
2135                         // Check whether number of columns in subalignment is sufficient
2136                         if (ncol>=NCOLMIN)
2137                             // Take global weights
2138                             for (k=0; k<N_in; k++)
2139                                 if(in[k] && I[k][i]>0) wi[k]=wg[k]; else wi[k]=0.0;
2140
2141                         // Calculate Neff[i]
2142                         Neff[i]=0.0;
2143                         for (j=1; j<=L; j++)
2144                             {
2145                                 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2146                                 for (a=0; a<20; a++) fj[a]=0;
2147                                 for (k=0; k<N_in; k++)
2148                                     if (in[k] && I[k][i]>0 && X[k][j]<ANY)
2149                                         fj[ (int)X[k][j] ]+=wi[k];
2150                                 NormalizeTo1(fj,NAA);
2151                                 for (a=0; a<20; a++)
2152                                     if (fj[a]>1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]);
2153                             }
2154                         if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
2155
2156                     }
2157                     // Calculate transition probabilities from I state
2158                     q.tr[i][I2M]=q.tr[i][I2I]=0.0;
2159                     for (k=0; k<N_in; k++) //for all sequences
2160                         {
2161                             if (in[k] && I[k][i]>0) //current state is I
2162                                 {
2163                                     q.tr[i][I2M]+=wi[k];
2164                                     q.tr[i][I2I]+=wi[k]*(I[k][i]-1);
2165                                 }
2166                         } // end for(k)
2167                 }
2168
2169             else // fast global weights?
2170                 {
2171                     float w_I=-1.0/N_filtered;
2172                     ncol=0;
2173                     q.tr[i][I2M]=q.tr[i][I2I]=0.0;
2174                     // Calculate amino acid frequencies fj[a] from weights wg[k]
2175                     for (k=0; k<N_in; k++)
2176                         if (in[k] && I[k][i]>0)
2177                             {
2178                                 ncol++;
2179                                 w_I+=wg[k];
2180                                 q.tr[i][I2M]+=wi[k];
2181                                 q.tr[i][I2I]+=wi[k]*(I[k][i]-1);
2182                             }
2183                     if (ncol>0)
2184                         {
2185                             if (w_I<0) Neff[i]=1.0;
2186                             else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_I);
2187                             // fprintf(stderr,"I i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_I=%5.3f Neff_I=%5.2f\n",i,ncol,q.Neff_HMM,Nlim,w_I,Neff[i]);
2188                         }
2189                     else
2190                         {
2191                             Neff[i]=0.0;
2192                             q.tr[i][I2M]=-100000;
2193                             q.tr[i][I2I]=-100000;
2194                             continue;
2195                         }
2196                 }
2197
2198             // Normalize and take log
2199             sum = q.tr[i][I2M]+q.tr[i][I2I];
2200             q.tr[i][I2M]=log2(q.tr[i][I2M]/sum);
2201             q.tr[i][I2I]=log2(q.tr[i][I2I]/sum);
2202
2203         }
2204     // end loop through alignment columns i
2205     //////////////////////////////////////////////////////////////////////////////////////////////
2206
2207     delete[](wi); wi = NULL;
2208     // delete n[][]
2209     for (j=1; j<=L; j++){
2210         delete[](n[j]); (n[j]) = NULL;
2211     }
2212     delete[](n); (n) = NULL;
2213
2214     q.tr[0][I2M]=0;
2215     q.tr[0][I2I]=-100000;
2216     q.tr[L][I2M]=0;
2217     q.tr[L][I2I]=-100000;
2218     q.Neff_I[0]=99.999;
2219
2220     // Assign Neff_I[i]
2221     for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i] and Neff[i]
2222         q.Neff_I[i]=Neff[i];
2223
2224     delete[] Neff; Neff = NULL;
2225     return;
2226
2227 } /* this is the end of Alignment::Transitions_from_I_state() */
2228
2229
2230 /////////////////////////////////////////////////////////////////////////////////////
2231 /**
2232  * @brief Calculate transitions q.tr[i][a] (a=DM,DD) with pos-specific subalignments
2233  */
2234 void 
2235 Alignment::Transitions_from_D_state(HMM& q, char* in)
2236 {
2237     // Calculate position-dependent weights wi[k] for each i.
2238     // For calculation of weights in column i use sub-alignment
2239     // over sequences which have a DELETE in column i
2240     // and over columns where none of these sequences has an end gap.
2241     // This is done by updating the arrays n[j][a] and rj[j] at each step i-1->i while letting i run from 1 to L.
2242     // n[j][a] = number of occurences of index a at column j of the subalignment,
2243     // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
2244     // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
2245     // then the old values wi[k], Neff[i-1], and ncol are used for the new position i.
2246     // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
2247
2248     int k; // index of sequence
2249     int i,j; // position in alignment
2250     int a; // amino acid (0..19)
2251     int naa; // number of different amino acids
2252     int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
2253     //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
2254     float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i
2255     //float Neff[MAXRES]; // diversity of subalignment i 
2256     float *Neff = new(float[par.maxResLen]); // diversity of subalignment i 
2257     int nseqi=0; // number of sequences in subalignment i (for DEBUGGING)
2258     int ncol=0; // number of columns j that contribute to Neff[i]
2259     char change; // has the set of sequences in subalignment changed? 0:no 1:yes
2260     float fj[NAA+3]; // to calculate entropy
2261     float sum;
2262     float Nlim=0.0; // only for global weights
2263     float scale=0.0; // only for global weights
2264
2265     wi = new(float[N_in+2]); /* FIXME: FS */
2266     // Global weights?
2267     if (par.wg==1)
2268         {
2269             for (k=0; k<N_in; k++) wi[k]=wg[k];
2270             Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
2271             scale=log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with dels at specific pos
2272         }
2273
2274     // Initialization
2275     n = new(int*[L+2]);
2276     for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2277     for (j=1; j<=L; j++)
2278         for (a=0; a<NAA+3; a++) n[j][a]=0;
2279
2280
2281
2282     //////////////////////////////////////////////////////////////////////////////////////////////
2283     // Main loop through alignment columns
2284     for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
2285         {
2286             if (par.wg==0) // if local weights
2287                 {
2288                     change=0;
2289                     // Check all sequences k and update n[j][a] and ri[j] if necessary
2290                     for (k=0; k<N_in; k++)
2291                         {
2292                             if (!in[k]) continue;
2293                             if (X[k][i-1]!=GAP && X[k][i]==GAP)
2294                                 { // ... if sequence k was NOT included in i-1 and has to be included for column i
2295                                     change=1;
2296                                     nseqi++;
2297                                     for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
2298                                 }
2299                             else if (X[k][i-1]==GAP && X[k][i]!=GAP)
2300                                 { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
2301                                     change=1;
2302                                     nseqi--;
2303                                     for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]--;
2304                                 }
2305                         } //end for (k)
2306                     nseqs[i]=nseqi;
2307
2308                     // If there is no sequence in subalignment j ...
2309                     if (nseqi==0)
2310                         {
2311                             ncol=0;
2312                             Neff[i]=0.0; // effective number of sequences = 0!
2313                             q.tr[i][D2M]=-100000;
2314                             q.tr[i][D2D]=-100000;
2315                             continue;
2316                         }
2317
2318                     // If subalignment changed: update weights wi[k] and Neff[i]
2319                     if (change)
2320                         {
2321                             // Initialize weights and numbers of residues for subalignment i
2322                             ncol=0;
2323                             for (k=0; k<N_in; k++) wi[k]=0.0;
2324
2325                             // sum wg[k][i] over all columns j and sequences k of subalignment
2326                             for (j=1; j<=L; j++)
2327                                 {
2328                                     if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2329                                     naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
2330                                     if (naa==0) continue;
2331                                     ncol++;
2332                                     for (k=0; k<N_in; k++)
2333                                         {
2334                                             if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
2335                                                 {
2336                                                     if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Di=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
2337                                                     wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
2338                                                 }
2339                                         }
2340                                 }
2341
2342                             // Check whether number of columns in subalignment is sufficient
2343                             if (ncol<NCOLMIN)
2344                                 // Take global weights
2345                                 for (k=0; k<N_in; k++)
2346                                     if(in[k] && X[k][i]==GAP) wi[k]=wg[k]; else wi[k]=0.0;
2347
2348                             // Calculate Neff[i]
2349                             Neff[i]=0.0;
2350                             for (j=1; j<=L; j++)
2351                                 {
2352                                     if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2353                                     for (a=0; a<20; a++) fj[a]=0;
2354                                     for (k=0; k<N_in; k++)
2355                                         if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
2356                                             fj[ (int)X[k][j] ]+=wi[k];
2357                                     NormalizeTo1(fj,NAA);
2358                                     for (a=0; a<20; a++)
2359                                         if (fj[a]>1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]);
2360                                 }
2361                             if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
2362
2363                         }
2364
2365                     else //no update was necessary; copy values for i-1
2366                         {
2367                             Neff[i]=Neff[i-1];
2368                         }
2369
2370                     // Calculate transition probabilities from D state
2371                     q.tr[i][D2M]=q.tr[i][D2D]=0.0;
2372                     for (k=0; k<N_in; k++) //for all sequences
2373                         {
2374                             if (in[k] && X[k][i]==GAP) //current state is D
2375                                 {
2376                                     if (X[k][i+1]==GAP) //next state is D
2377                                         q.tr[i][D2D]+=wi[k];
2378                                     else if (X[k][i+1]<=ANY) //next state is M
2379                                         q.tr[i][D2M]+=wi[k];
2380                                 }
2381                         } // end for(k)
2382                 }
2383
2384             else // fast global weights?
2385                 {
2386                     float w_D=-1.0/N_filtered;
2387                     ncol=0;
2388                     q.tr[i][D2M]=q.tr[i][D2D]=0.0;
2389                     // Calculate amino acid frequencies fj[a] from weights wg[k]
2390                     for (k=0; k<N_in; k++) //for all sequences
2391                         if (in[k] && X[k][i]==GAP) //current state is D
2392                             {
2393                                 ncol++;
2394                                 w_D+=wg[k];
2395                                 if (X[k][i+1]==GAP) //next state is D
2396                                     q.tr[i][D2D]+=wi[k];
2397                                 else if (X[k][i+1]<=ANY) //next state is M
2398                                     q.tr[i][D2M]+=wi[k];
2399                             }
2400                     if (ncol>0)
2401                         {
2402                             if (w_D<0) Neff[i]=1.0;
2403                             else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_D);
2404                             // fprintf(stderr,"D i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_D=%5.3f Neff_D=%5.2f\n",i,ncol,q.Neff_HMM,Nlim,w_D,Neff[i]);
2405                         }
2406                     else
2407                         {
2408                             Neff[i]=0.0; // effective number of sequences = 0!
2409                             q.tr[i][D2M]=-100000;
2410                             q.tr[i][D2D]=-100000;
2411                             continue;
2412                         }
2413                 }
2414
2415             // Normalize and take log
2416             sum = q.tr[i][D2M]+q.tr[i][D2D];
2417             q.tr[i][D2M]=log2(q.tr[i][D2M]/sum);
2418             q.tr[i][D2D]=log2(q.tr[i][D2D]/sum);
2419
2420         }
2421     // end loop through alignment columns i
2422     //////////////////////////////////////////////////////////////////////////////////////////////
2423
2424     q.tr[0][D2M]=0;
2425     q.tr[0][D2D]=-100000;
2426     q.Neff_D[0]=99.999;
2427
2428     // Assign Neff_D[i]
2429     for (i=1; i<=L; i++)
2430         q.Neff_D[i]=Neff[i];
2431
2432     delete[](wi); wi = NULL;/* FIXME: FS */
2433     // delete n[][]
2434     for (j=1; j<=L; j++){
2435         delete[](n[j]); (n[j]) = NULL;
2436     }
2437     delete[](n); (n) = NULL;
2438
2439     delete[] Neff; Neff = NULL;
2440     return;
2441
2442 } /* this is the end of Alignment::Transitions_from_D_state() */
2443
2444
2445
2446 /////////////////////////////////////////////////////////////////////////////////////
2447 /**
2448  * @brief Write alignment without insert states (lower case) to alignment file?
2449  */
2450 void 
2451 Alignment::WriteWithoutInsertsToFile(char* alnfile)
2452 {
2453     if (v>=2) cout<<"Writing alignment to "<<alnfile<<"\n";
2454     FILE* alnf;
2455     if (!par.append) alnf = fopen(alnfile,"w"); else alnf = fopen(alnfile,"a");
2456     if (!alnf) OpenFileError(alnfile);
2457     // If alignment name is different from that of query: write name into commentary line
2458     if (strncmp(longname,sname[kfirst],DESCLEN-1)) fprintf(alnf,"#%s\n",longname);
2459     if (v>=2) cout<<"Writing alignment to "<<alnfile<<"\n";
2460     for (int k=0; k<N_in; k++)
2461         if (keep[k] || display[k]==KEEP_ALWAYS) // print if either in profile (keep[k]>0) or display is obligatory (display[k]==2)
2462             {
2463                 fprintf(alnf,">%s\n",sname[k]);
2464                 for (int i=1; i<=L; i++) fprintf(alnf,"%c",i2aa(X[k][i]));
2465                 fprintf(alnf,"\n");
2466             }
2467     fclose(alnf);
2468 }
2469
2470 /////////////////////////////////////////////////////////////////////////////////////
2471 // Write stored,filtered sequences WITH insert states (lower case) to alignment file?
2472 /////////////////////////////////////////////////////////////////////////////////////
2473 void Alignment::WriteToFile(char* alnfile, const char format[])
2474 {
2475     FILE* alnf;
2476     if (!par.append) alnf = fopen(alnfile,"w"); else alnf = fopen(alnfile,"a");
2477     if (!alnf) OpenFileError(alnfile);
2478     // If alignment name is different from that of query: write name into commentary line
2479     if (strncmp(longname,sname[kfirst],DESCLEN-1)) fprintf(alnf,"#%s\n",longname);
2480     if (!format || !strcmp(format,"a3m"))
2481         {
2482             if (v>=2) cout<<"Writing A3M alignment to "<<alnfile<<"\n";
2483             for (int k=0; k<N_in; k++)
2484                 if (keep[k] || display[k]==KEEP_ALWAYS) // print if either in profile (keep[k]>0) or display obligatory (display[k]==2)
2485                     fprintf(alnf,">%s\n%s\n",sname[k],seq[k]+1);
2486         }
2487     else // PSI-BLAST format
2488         {
2489             if (v>=2) cout<<"Writing PSI-BLAST-formatted alignment to "<<alnfile<<"\n";
2490             for (int k=kfirst; k<N_in; k++) // skip sequences before kfirst!!
2491                 if (keep[k] || display[k]==KEEP_ALWAYS) // print if either in profile (keep[k]>0) or display obligatory (display[k]==2)
2492                     {
2493                         strcut(sname[k]);
2494                         fprintf(alnf,"%-20.20s ",sname[k]);
2495                         // for (int i=1; i<=L; i++) fprintf(alnf,"%c",i2aa(X[k][i]));
2496                         // fprintf(alnf,"\n");
2497                         char* ptr=seq[k];
2498                         for (; *ptr!='\0'; ptr++)
2499                             if (*ptr==45 || (*ptr>=65 && *ptr<=90)) fprintf(alnf,"%c",*ptr);
2500                         fprintf(alnf,"\n");
2501                     }
2502         }
2503
2504     fclose(alnf);
2505 }
2506
2507
2508
2509 /* 
2510  * FIXME: this function contains a reference to MAXSEQ & MAXCOL
2511  * however, this may not be accessed  (FS) 
2512  */
2513 /////////////////////////////////////////////////////////////////////////////////////
2514 /**
2515  * @brief Read a3m slave alignment of hit from file and merge into (query) master alignment
2516  */
2517 void 
2518 Alignment::MergeMasterSlave(Hit& hit, char ta3mfile[])
2519 {
2520  Alignment Tali;
2521  char* cur_seq = new(char[MAXCOL]); // Sequence currently read in
2522  int maxcol=MAXCOL;
2523  int l,ll; // position in unaligned template (T) sequence Tali.seq[l]
2524  int i; // counts match states in query (Q) HMM
2525  int j; // counts match states in T sequence Tali.seq[l]
2526  int h; // position in aligned T sequence cur_seq[h]
2527  int k; // sequence index
2528  char c; //
2529  printf("****************%s:%s:%d: did get into MergeMasterSlave\n", __FUNCTION__, __FILE__, __LINE__);
2530  if (v>=3) printf("Merging %s to query alignment\n",ta3mfile);
2531
2532  // If par.append==1 do not print query alignment
2533  if (par.append) for (k=0; k<N_in; k++) keep[k]=display[k]=KEEP_NOT;
2534
2535  // Read template alignment into Tali
2536  FILE* ta3mf=fopen(ta3mfile,"r");
2537  if (!ta3mf) OpenFileError(ta3mfile);
2538  Tali.Read(ta3mf,ta3mfile);
2539  fclose(ta3mf);
2540
2541  // Filter Tali alignment
2542  Tali.Compress(ta3mfile);
2543  N_filtered = Tali.Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);
2544
2545  // Record imatch[j]
2546  int* imatch=new(int[hit.j2+1]);
2547  int step = hit.nsteps;
2548  for (j=hit.j1; j<=hit.j2; j++)
2549  {
2550  // Advance to position of next T match state j
2551  while (hit.j[step]<j) step--;
2552  imatch[j] = hit.i[step];
2553 // printf("step=%-3i i=%-3i j=%-3i\n",step,imatch[j],j);
2554  }
2555
2556  // Determine number of match states of Qali
2557  for (L=0,l=1; seq[kfirst][l]>'\0'; l++)
2558  if ((seq[kfirst][l]>='A' && seq[kfirst][l]<='Z') || seq[kfirst][l]=='-') L++;
2559
2560  // For each sequence in T alignment: align to Qali
2561  for (k=0; k<Tali.N_in; k++)
2562  {
2563  if (!Tali.keep[k]) continue;
2564  if (N_in>=MAXSEQ)
2565  {
2566  fprintf(stderr,"WARNING in %s: maximum number of %i sequences exceeded while reading %s. Skipping all following sequences\n",program_name,MAXSEQ,ta3mfile);
2567  break;
2568  }
2569  cur_seq[0]=' '; // 0'th position not used
2570
2571  // Add the hit.i1-1 left end gaps to aligned sequence
2572  for (h=1; h<hit.i1; h++) cur_seq[h]='-';
2573
2574  // Advance to match state hit.j1 of Tali.seq[k]
2575  for (j=0, l=1; (c=Tali.seq[k][l])>'\0'; l++)
2576  if ((c>='A' && c<='Z') || c=='-') // match state at position l?
2577  if ((++j)==hit.j1) break; // yes: increment j. Reached hit,j1? yes: break
2578
2579  if (j<hit.j1)
2580  {printf("Error: did not find %i match states in sequence %i of %s. Sequence:\n%s\n",hit.j1,k,Tali.name,Tali.seq[k]); exit(1);}
2581
2582  // Write first match state to cur_seq
2583  int iprev=hit.i1; // index of previous query match state
2584  int lprev=l; // previous T match state in Tali.seq[k][l]
2585  cur_seq[h++] = Tali.seq[k][l]; // first column of alignment is Match-Match state
2586
2587  // For each further match state j in alignment
2588  step = hit.nsteps;
2589  for (j=hit.j1+1; j<=hit.j2; j++)
2590  {
2591  // Advance to position of next T match state j
2592  i=imatch[j];
2593
2594  // Advance to position of next T match state j
2595  while ((c=Tali.seq[k][++l])>'\0' && ((c>='a' && c<='z') || c=='.')) ;
2596
2597  int di=i-iprev; // number of Match states in Q between T match state j-1 and j
2598  int dl=l-lprev; // 1 + number of inserted residues in T sequence between T match state j-1 and j
2599  if (di==1)
2600  {
2601  // One Q match state for one T match state (treated as special case for speed reasons)
2602  // i: i-1 i di=1
2603  // Q: XXXXXX.....XXXXXX
2604  // T: YYYYYYyyyyyYYYYYY
2605  // j: j-1 j
2606  // l: lprev l dl=6
2607
2608  // Inserts in lower case
2609  for (ll=lprev+1; ll<l; ll++)
2610  if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]);
2611
2612  // Template Match state -> upper case
2613  cur_seq[h++] = Tali.seq[k][ll];
2614  }
2615  else if (di==0)
2616  {
2617  // Gap in query: no Q match state for on T match state (special case for speed reasons)
2618  // i: i-1 i-1 di=0
2619  // Q: XXXXXX.....~~~XXX
2620  // T: YYYYYYyyyyyYYYYYY
2621  // j: j-1 j
2622  // l: lprev l dl=6
2623
2624  // All T residues (including T match state) in lower case
2625  for (ll=lprev+1; ll<=l; ll++)
2626  if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]);
2627  }
2628  else if (di>=dl)
2629  {
2630  // More Match states in Q than Inserts in the T sequence
2631  // => half T inserts y left, half right-aligned in uc, gaps to fill up
2632  // Number of T insert residues to be left-aligned: (int)(dl/2)
2633  // i: iprev i di=7
2634  // Q: XXXXXXXXXXXXXXXXXX
2635  // T: YYYYYYYyyy-yyYYYYY
2636  // j: j-1 j
2637  // l: lprev l dl=6
2638
2639  // Add left-bounded template residues
2640  for (ll=lprev+1; ll<=lprev+(int)(dl/2); ll++)
2641  cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2642
2643  // Add central gaps
2644  for (int gap=1; gap<=di-dl; gap++) cur_seq[h++]='-';
2645
2646  // Add right-bounded residues
2647  for (; ll<=l; ll++)
2648  cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2649  }
2650  else if (di<dl)
2651  {
2652  // Fewer Match states in Q than inserts in T sequence
2653  // => half of available space di for left- half for right-aligned T inserts, rest in lc
2654  // number of T inserts to be left-aligned in uc: (int)(di/2),
2655  // i: iprev i di=5
2656  // Q: XXXXXXXXX.XXXXXXX
2657  // T: YYYYYYYyyyyyYYYYY
2658  // j: j-1 j
2659  // l: lprev l dl=6
2660
2661  // Add left-bounded template residues
2662  for (ll=lprev+1; ll<=lprev+(int)(di/2); ll++)
2663  cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2664
2665  // Add central inserts
2666  for (int ins=1; ins<=dl-di; ins++,ll++)
2667  if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]);
2668
2669  // Add right-bounded residues
2670  for (; ll<=l; ll++)
2671  cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2672  }
2673 // printf("i=%-3i j=%-3i l=%-3i cur_seq=%s\n",i,j,l,cur_seq);
2674
2675  iprev=i; lprev=l;
2676  if (h>=maxcol-1000) // too few columns? Reserve double space
2677  {
2678  char* new_seq=new(char[2*maxcol]);
2679  strncpy(new_seq,cur_seq,maxcol); //////// check: maxcol-1 ????
2680  delete[](cur_seq); (cur_seq) = NULL;
2681  cur_seq=new_seq;
2682  maxcol*=2;
2683  }
2684  }
2685
2686  // Add the remaining gaps '-' to the end of the template sequence
2687  for (i=hit.i2+1; i<=L; i++) cur_seq[h++]='-';
2688  cur_seq[h++]='\0';
2689
2690  keep[N_in] = display[N_in] = KEEP_CONDITIONALLY;
2691  seq[N_in]=new(char[h]);
2692  if (!seq[N_in]) MemoryError("array for input sequences");
2693  strcpy(seq[N_in],cur_seq);
2694  X[N_in]=new(char[h]);
2695  if (!X[N_in]) MemoryError("array for input sequences");
2696  I[N_in]=new(short unsigned int[h]);
2697  if (!I[N_in]) MemoryError("array for input sequences");
2698  sname[N_in]=new(char[strlen(Tali.sname[k])+1]);
2699  if (!sname[N_in]) MemoryError("array for input sequences");
2700  strcpy(sname[N_in],Tali.sname[k]);
2701  N_in++;
2702
2703 // printf("k=%-3i %s\n",k,Tali.seq[k]);
2704 // printf("Query %s\n",seq[kfirst]);
2705 // printf("k=%-3i %s\n\n",k,cur_seq);
2706
2707  } // end for (k)
2708
2709 // printf("N_in=%-5i HMM=%s with %i sequences\n",N_in,ta3mfile,N_filtered);
2710
2711  delete[] cur_seq; cur_seq = NULL;
2712  delete[] imatch; imatch = NULL;
2713  delete[] ksort; ksort=NULL; // if ksort already existed it will be to short for merged alignment
2714  delete[] first; first=NULL; // if first already existed it will be to short for merged alignment
2715  delete[] last; last=NULL; // if last already existed it will be to short for merged alignment
2716
2717 } /* this is the end of Alignment::MergeMasterSlave() */
2718
2719
2720 /////////////////////////////////////////////////////////////////////////////////////
2721 /**
2722  * @brief Add a sequence to Qali
2723  */
2724 void 
2725 Alignment::AddSequence(char Xk[], int Ik[])
2726 {
2727  int i; // position in query and target
2728  if (L<=0) InternalError("L is not set in AddSequence()");
2729  X[N_in]=new(char[L+2]);
2730  for (i=0; i<=L+1; i++) X[N_in][i]=Xk[i];
2731  if (Ik==NULL)
2732  for (i=0; i<=L+1; i++) I[N_in][i]=0;
2733  else
2734  for (i=0; i<=L+1; i++) I[N_in][i]=Ik[i];
2735  N_in++;
2736 }
2737
2738
2739 /////////////////////////////////////////////////////////////////////////////////////
2740 /**
2741  * @brief Determine matrix of position-specific weights w[k][i] for multiple alignment
2742  * Pos-specific weights are calculated like in "Amino_acid_frequencies_and_transitions_from_M_state()"
2743  */
2744 void 
2745 Alignment::GetPositionSpecificWeights(float* w[])
2746 {
2747  // Calculate position-dependent weights wi[k] for each i.
2748  // For calculation of weights in column i use sub-alignment
2749  // over sequences which have a *residue* in column i (no gap, no end gap)
2750  // and over columns where none of these sequences has an end gap.
2751  // This is done by updating the arrays n[j][a] at each step i-1->i while letting i run from 1 to L.
2752  // n[j][a] = number of occurences of amino acid a at column j of the subalignment,
2753  // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
2754  // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
2755  // then the old values w[k][i] and ncol are used for the new position i.
2756  // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
2757
2758  char* in=keep; // to keep the code similar to Amino_acid_frequencies_and_transitions_from_M_state()
2759  int k; // index of sequence
2760  int i,j; // position in alignment
2761  int a; // amino acid (0..19)
2762  int naa; // number of different amino acids
2763  int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
2764  int nseqi=0; // number of sequences in subalignment i
2765  int ncol=0; // number of columns j that contribute to Neff[i]
2766  char change; // has the set of sequences in subalignment changed? 0:no 1:yes
2767
2768
2769  // Global weights?
2770  if (par.wg==1)
2771  {
2772  for (k=0; k<N_in; k++)
2773  for (i=1; i<=L; i++) w[k][i]=wg[k];
2774  }
2775  else
2776  {
2777
2778  // Initialization
2779  n = new(int*[L+2]);
2780  for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2781  for (j=1; j<=L; j++)
2782  for (a=0; a<NAA+3; a++) n[j][a]=0;
2783
2784  //////////////////////////////////////////////////////////////////////////////////////////////
2785  // Main loop through alignment columns
2786  for (i=1; i<=L; i++) // Calculate w[k][i]
2787  {
2788  change=0;
2789  // Check all sequences k and update n[j][a] and ri[j] if necessary
2790  for (k=0; k<N_in; k++)
2791  {
2792  if (!in[k]) continue;
2793  if (X[k][i-1]>=ANY && X[k][i]<ANY)
2794  { // ... if sequence k was NOT included in i-1 and has to be included for column i
2795  change=1;
2796  nseqi++;
2797  for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
2798  }
2799  else if (X[k][i-1]<ANY && X[k][i]>=ANY)
2800  { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
2801  change=1;
2802  nseqi--;
2803  for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]--;
2804  }
2805  } //end for (k)
2806  nseqs[i]=nseqi;
2807
2808  // If subalignment changed: update weights w[k][i] and Neff[i]
2809  if (change)
2810  {
2811  // Initialize weights and numbers of residues for subalignment i
2812  ncol=0;
2813  for (k=0; k<N_in; k++) w[k][i]=0.0;
2814
2815  // sum wi[k] over all columns j and sequences k of subalignment
2816  for (j=1; j<=L; j++)
2817  {
2818  // do at least a fraction MAXENDGAPFRAC of sequences in subalignment contain an end gap in j?
2819  if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2820  naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
2821  if (naa==0) continue;
2822  ncol++;
2823  for (k=0; k<N_in; k++)
2824  {
2825  if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
2826  {
2827 // if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Mi=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
2828  w[k][i]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
2829  }
2830  }
2831  }
2832
2833  // Check whether number of columns in subalignment is sufficient
2834  if (ncol<NCOLMIN)
2835  // Take global weights
2836  for (k=0; k<N_in; k++)
2837  if(in[k]) {if(X[k][i]<ANY) w[k][i]=wg[k]; else w[k][i]=0.0;}
2838  }
2839  }
2840  // end loop through alignment columns i
2841  ///////////////////////////////////////////////////////////////////////
2842
2843  // delete n[][]
2844  for (j=1; j<=L; j++){
2845  delete[](n[j]); (n[j]) = NULL;
2846  }
2847  delete[](n); (n) = NULL;
2848
2849  }
2850  return;
2851 }
2852
2853 #ifdef CLUSTALO
2854 /* @* Transfer
2855  *
2856  * take sequence data from Clustal and transfer it into
2857  * hhalign accessible information (structure/class)
2858  *
2859  * Note that hhalign does not see all sequences/profiles
2860  * but only sequences that are elements of the 2 profiles
2861  * to be aligned.
2862  *
2863  * References to the required sequences are passed into hhalign
2864  * through auxilliary pointers that are shallow copies of the
2865  * sequence/profile data available to Clustal.
2866  *
2867  * Re-allocating memory for these auxilliary pointers
2868  * would be desaterous, as it might detach the memory
2869  * seen by Clustal.
2870  */
2871 void 
2872 Alignment::Transfer(char **ppcProf, int iCnt){
2873
2874     /* @<variables local to Transfer@> */
2875     int iLen; /* length of profile */
2876     int k; /* generic iterator */
2877
2878     /* @<initialisation@> */
2879     N_in = iCnt;
2880     N_filtered = N_ss = 0;
2881     kss_dssp = ksa_dssp = kss_pred = kss_conf = -1;
2882     kfirst = 0;
2883     strcpy(longname, "unknown_long_seq_name");
2884     strcpy(name, "unknown_seq_name");
2885     strcpy(file, "unknown_file_name");
2886     n_display = iCnt;
2887
2888     /* @<determine length of profile@>
2889        all sequences in profile should have same length,
2890        so only do it for 1st */
2891     for (iLen = 0; '\0' != ppcProf[0][iLen]; iLen++);
2892
2893     /* @<allocate memory for sequences etc@> */
2894     for (k = 0; k < iCnt; k++){
2895 #define GOOD_MEASURE 1000 /* Temporary -- can be removed once rest in place */
2896         I[k] = new(short unsigned int[iLen+2+GOOD_MEASURE]);
2897         X[k] = new(char[iLen+2+GOOD_MEASURE]);
2898         seq[k] = new(char[iLen+2+GOOD_MEASURE]);
2899         seq[k][0] = ' ';
2900         seq[k][1] = '\0';
2901         if (NULL == ppcProf[k]){
2902             printf("%s:%d: Arena[%d]=NULL, cnt=%d\n", __FILE__, __LINE__, k, iCnt);
2903             exit(-1);
2904         }
2905         strcat(seq[k], ppcProf[k]);
2906         keep[k] = KEEP_CONDITIONALLY;
2907         display[k] = KEEP_CONDITIONALLY;
2908         sname[k] = new(char[GOOD_MEASURE]);
2909         strcpy(sname[k], "unknown_sname");
2910     } /* (0 <= k < iCnt) */
2911     /* FIXME: Soeding always makes 1st sequence permanent */
2912     /*keep[0] = KEEP_ALWAYS;
2913       display[k] = KEEP_ALWAYS;*/
2914 #if 1
2915     /* Believe that the first and last positions are
2916        most important in stability of this algorithm.
2917        Must make sure that at least 2 sequences with
2918        residues in these positions are kept.
2919        Think any sequence will do, but better to keep
2920        the one with the longest 'contig'
2921     */
2922     int iSeq; /* sequence iterator */
2923     int iHeadLen = 0, iHeadID = -1; /* length & ID of longest head contig */
2924     int iTailLen = 0, iTailID = -1; /* length & ID of longest head contig */
2925     int iCont = -1;
2926     char *pcFind = NULL;
2927
2928 #if 0
2929     printf("%s:%s:%d: NEW PROFILE (%d seq) ================\n",
2930            __FUNCTION__, __FILE__, __LINE__, iCnt);
2931 #endif
2932     for (iSeq = 0; iSeq < iCnt; iSeq++){
2933 #if 0
2934         printf("%s:%s:%d: consider seq %d ------------------\n",
2935                __FUNCTION__, __FILE__, __LINE__, iSeq);
2936 #endif
2937         pcFind = strchr(&seq[iSeq][1], '-');
2938         if (NULL == pcFind){
2939             /* no gap at all in this sequences, spans entire profile */
2940             iHeadID = iTailID = iSeq;
2941             iHeadLen = iTailLen = iLen;
2942             break;
2943         }
2944         iCont = (int)(pcFind - &seq[iSeq][1]);
2945         if (iCont > iHeadLen){
2946             iHeadLen = iCont;
2947             iHeadID  = iSeq;
2948         }
2949         pcFind = strrchr(seq[iSeq], '-');
2950         iCont = iLen - (int)(pcFind - seq[iSeq]);
2951         if (iCont > iTailLen){
2952             iTailLen = iCont;
2953             iTailID  = iSeq;
2954         }
2955
2956 #if 0
2957         printf("%s:%s:%d: seq %3d: len = %d(%d) %s\n",
2958                __FUNCTION__, __FILE__, __LINE__, iSeq, iCont, iLen, seq[iSeq]);
2959 #endif
2960     } /* 0 <= iSeq < iCnt */
2961 #if 0
2962     printf("%s:%s:%d: seq %d is winner with head contig of %d, seq %d tail contig of %d\n"
2963            , __FUNCTION__, __FILE__, __LINE__, iHeadID, iHeadLen, iTailID, iTailLen);
2964 #endif
2965     if ( (-1 == iHeadID) || (-1 == iTailID) ){
2966         printf("%s:%s:%d: profile has no leading and/or trailing residues (h=%d:t=%d:#=%d)\n",
2967                __FUNCTION__, __FILE__, __LINE__, iHeadID, iTailID, iCnt);
2968     }
2969     else{
2970         keep[iHeadID] = KEEP_ALWAYS;
2971         keep[iTailID] = KEEP_ALWAYS;
2972     }
2973 #endif
2974     /* @= */
2975     return;
2976
2977 } /* this is the end of Transfer() */
2978 #endif
2979
2980 #ifdef CLUSTALO
2981 /* @* Alignment::ClobberGlobal (eg: qali)
2982  *
2983  * Note: originally hhalign() was stand-alone code,
2984  * there are a couple of GLOBAL (!) variables,
2985  * which would have been destroyed on exit.
2986  * However, now there is no 'exit' from hhalign(),
2987  * and on re-entry the global variable must be clean again.
2988  */
2989 void 
2990 Alignment::ClobberGlobal(void){
2991
2992  /* @<essentials@>
2993  these are essential to re-set (as some of them are used as flags) */
2994  for(int k=0; k<N_in; k++)
2995  {
2996  delete[] sname[k]; sname[k] = NULL;
2997  delete[] seq[k]; seq[k] = NULL;
2998  delete[] X[k]; X[k] = NULL;
2999  delete[] I[k]; I[k] = NULL;
3000  }
3001  delete[] nres; nres = NULL;
3002  delete[] first; first = NULL;
3003  delete[] last; last = NULL;
3004  delete[] ksort; ksort = NULL;
3005  N_in = N_filtered = n_display = 0;
3006  L = 0;
3007  kss_dssp = ksa_dssp = kss_pred = kss_conf = kfirst = -1;
3008
3009  /* @<re-set but keep memory@>
3010  do not free the memory but re-set content */
3011  longname[0] = '\0'; //delete[] longname; longname = NULL;
3012  keep[0] = '\0'; //delete[] keep; keep = NULL;
3013  display[0] = '\0'; //delete[] display; display = NULL;
3014  wg[0] = 0; //delete[] wg; wg = NULL;
3015  nseqs[0] = 0; //delete[] nseqs; nseqs = NULL;
3016  name[0]='\0';
3017  fam[0]='\0';
3018  file[0]='\0';
3019  //delete[] sname; sname = NULL;
3020  //delete[] seq; seq = NULL;
3021  //delete[] X; X = NULL;
3022  //delete[] I; I = NULL;
3023  //delete[] l; l = NULL;
3024
3025  /* @= */
3026  return;
3027 }
3028 #endif