/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /********************************************************************* * Clustal Omega - Multiple sequence alignment * * Copyright (C) 2010 University College Dublin * * Clustal-Omega is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This file is part of Clustal-Omega. * ********************************************************************/ /* * RCS $Id: hhalignment-C.h 316 2016-12-16 16:14:39Z fabian $ */ /* * Changelog: Michael Remmert made changes to hhalign stand-alone code * FS implemented some of the changes on 2010-10-28 -> MR1 * * Note: MR seems to have changed all [aijk]++ to ++[aijk], * FS did not do that on 2010-10-28 */ // hhalignment.C ////////////////////////////////////////////////////////////////////////////// //// Class Alignment ////////////////////////////////////////////////////////////////////////////// // hhalignment.C #ifndef MAIN #define MAIN #include // cin, cout, cerr #include // ofstream, ifstream #include // printf using std::cout; using std::cerr; using std::endl; using std::ios; using std::ifstream; using std::ofstream; #include // exit #include // strcmp, strstr #include // sqrt, pow #include // INT_MIN #include // FLT_MIN #include // clock #include // islower, isdigit etc #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "list.h" // list data structure #include "hash.h" // hash data structure #include "hhdecl-C.h" #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "hhhmm.h" #endif #ifdef HAVE_CONFIG_H #include "config.h" #endif //#include "new_new.h" /* memory tracking */ enum {KEEP_NOT = 0, KEEP_CONDITIONALLY, KEEP_ALWAYS}; ////////////////////////////////////////////////////////////////////////////// // Class Alignment ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// // Object constructor ////////////////////////////////////////////////////////////////////////////// Alignment::Alignment(int maxseq, int maxres) { //printf(">>>>>>>>%s:%s:%d: maxseq=%d, maxres=%d\n", __FUNCTION__, __FILE__, __LINE__, maxseq, maxres); /* (FS) */ longname = new(char[DESCLEN]); sname = new char*[maxseq+2]; /* MR1 */ seq = new char*[maxseq+2]; /* MR1 */ l = new int[maxres]; X = new char*[maxseq+2]; /* MR1 */ I = new short unsigned int*[maxseq+2]; /* MR1 */ keep = new char[maxseq+2]; /* MR1 */ display = new char[maxseq+2]; /* MR1 */ wg = new float[maxseq+2]; /* MR1 */ nseqs = new int[maxres+2]; /* MR1 */ N_in=L=0; nres=NULL; // number of residues per sequence k first=NULL; // first residue in sequence k last=NULL; // last residue in sequence k ksort=NULL; // sequence indices sorted by descending nres[k] name[0]='\0'; // no name defined yet longname[0]='\0'; // no name defined yet fam[0]='\0'; // no name defined yet file[0]='\0'; // no name defined yet readCommentLine = '0'; /* MR1 */ } ////////////////////////////////////////////////////////////////////////////// // Object destructor ////////////////////////////////////////////////////////////////////////////// Alignment::~Alignment() { delete[] longname; longname = NULL; for(int k=0; k') //line contains sequence name { if (k>=MAXSEQ-1) { if (v>=1 && k>=MAXSEQ) cerr<=0) //if this is at least the second name line { if (strlen(cur_seq)==0) { cerr<ss_dssp",8)) { if (kss_dssp<0) {display[k]=2; n_display++; keep[k]=KEEP_NOT; kss_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;} } else if (!strncmp(line,">sa_dssp",8)) { if (ksa_dssp<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; ksa_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;} } else if (!strncmp(line,">ss_pred",8)) { if (kss_pred<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_pred=k; N_ss++;} else {skip_sequence=1; k--; continue;} } else if (!strncmp(line,">ss_conf",8)) { if (kss_conf<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_conf=k; N_ss++;} else {skip_sequence=1; k--; continue;} } else if (!strncmp(line,">ss_",4) || !strncmp(line,">sa_",4)) { display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; N_ss++; } else if (!strncmp(line,">aa_",4)) { // ignore sequences beginning with ">aa_" skip_sequence=1; k--; continue; } //store first real seq else if (kfirst<0) { char word[NAMELEN]; strwrd(word,line); // Copies first word in ptr to str if (strstr(word,"_consensus")) {display[k]=2; keep[k]=0; n_display++; kfirst=k;} /* MR1 */ else {display[k]=keep[k]=KEEP_ALWAYS; n_display++; kfirst=k;} } //store all sequences else if (par.mark==0) {display[k]=keep[k]=KEEP_CONDITIONALLY; n_display++;} //store sequences up to nseqdis else if (line[1]=='@'&& n_display-N_ss=4) printf("Reading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",cur_name,k,n_display,display[k],keep[k]); sname[k] = new char[strlen(cur_name)+1]; if (!sname[k]) {MemoryError("array for sequence names");} strcpy(sname[k],cur_name); } // end if(line contains sequence name) else if (line[0]=='#') // Commentary line? { // #PF01367.9 5_3_exonuc: 5'-3' exonuclease, C-terminal SAM fold; PDB 1taq, 1bgx (T:271-174), 1taq (271-174) if (name[0]) continue; // if already name defined: skip commentary line char *ptr1, *ptr2; ptr1=strscn_(line+1); // set ptr1 to first non-whitespace character after '#' -> AC number strncpy(longname,ptr1,DESCLEN-1); // copy whole commentary line after '# ' into longname longname[DESCLEN-1]='\0'; strtr(longname,""," "); ptr2=strcut_(ptr1); // cut after AC number and set ptr2 to first non-whitespace character after AC number // strcpy(fam,ptr1); // copy AC number to fam // if (!strncmp(fam,"PF",2)) strcut_(fam,'.'); // if PFAM identifier contains '.' cut it off // strcut_(ptr2); // cut after first word ... strcpy(name,ptr1); // ... and copy first word into name readCommentLine = '1'; /* MR1 */ } //line contains sequence residues or SS information and does not belong to a >aa_ sequence else if (!skip_sequence) { if (v>=4) cout<'\0' && l=0) // ignore white-space characters ' ', \t and \n (aa2i()==-1) {cur_seq[l]=line[h]; l++;} else if (aa2i(line[h])==-2 && v) cerr<'\0' && l=0 && ss2i(line[h])<=7) {cur_seq[l]=ss2ss(line[h]); l++;} else if (v) cerr<'\0' && l=0) cur_seq[l++]=line[h]; else if (v) cerr<'\0' && l=0 && ss2i(line[h])<=3) {cur_seq[l]=ss2ss(line[h]); l++;} else if (v) cerr<'\0' && l='0' && line[h]<='9')) {cur_seq[l]=line[h]; l++;} else if (v) cerr<sa_pred etc { while (h'\0' && l='0' && line[h]<='9') || (line[h]>='A' && line[h]<='B')) {cur_seq[l]=line[h]; l++;} else if (v) cerr<=/*MAXCOL*/par.maxColCnt-1) { cerr<=0) //if at least one sequence was read in { seq[k]=new char[strlen(cur_seq)+2]; if (!seq[k]) MemoryError("array for input sequences"); X[k]=new char[strlen(cur_seq)+2]; if (!X[k]) MemoryError("array for input sequences"); I[k]=new short unsigned int[strlen(cur_seq)+2]; if (!I[k]) MemoryError("array for input sequences"); strcpy(seq[k],cur_seq); } else {cerr< extract from first sequence { char* ptr; // strtr(sname[kfirst],"~"," "); // 'transpose': replaces the tilde with a blanc everywhere in sname[kfirst] strncpy(longname,sname[kfirst],DESCLEN-1); // longname is name of first sequence longname[DESCLEN-1]='\0'; strncpy(name,sname[kfirst],NAMELEN-1); // Shortname is first word of longname... name[NAMELEN-1]='\0'; ptr = strcut(name); // ...until first white-space character if (ptr && islower(ptr[0]) && ptr[1]=='.' && isdigit(ptr[2])) //Scop family code present as second word? { lwrstr(name); // Transform upper case to lower case strcut(ptr); // Non-white-space characters until next white-space character.. strcpy(fam,ptr); // ...are the SCOP familiy code } else if (name[0]=='P' && name[1]=='F' && isdigit(name[2]) && isdigit(name[3]) ) //Pfam code { strcpy(fam,name); // set family name = Pfam code } } delete[] cur_seq; cur_seq = NULL; // Checking for warning messages if (v==0) return; if (v>=2) cout<<"Read "<=3) cout<<"Query sequence for alignment has number "< WARNING */ /* points to next character in seq[k] to be written */ /*static short unsigned int h[MAXSEQ];*/ /*short*/ unsigned int *h = NULL; /* short may lead to overflow for long alignments, FS, r235 -> r236 */ h = new /*short*/ unsigned int[N_in+2]; /* short -> overflow, FS, r235 -> r236 */ float *percent_gaps = NULL; /* FS, 2010-Nov */ char *match_state = NULL; /* FS, 2010-Nov */ // Initialize for (k=0;k=3) { if (par.M==1) cout<<"Using match state assignment by capital letters (a2m format)\n"; else if (par.M==2) cout<<"Using percentage-rule match state assignment\n"; else if (par.M==3) cout<<"Using residues of first sequence as match states\n"; } // Create matrices X and I with amino acids represented by integer numbers switch(par.M) { ///////////////////////////////////////////////////////////////////////// /* a2m/a3m format: match states capital case, inserts lower case, delete states '-', inserted gaps '.' The confidence values for ss prediction are interpreted as follows: 0-9:match states(!) '-' :match state '.':insert */ case 1: default: // Warn if alignment is ment to be -M first or -M NN instead of A2M/A3M if (v>=2 && strchr(seq[kfirst],'-') ) // Seed/query sequence contains a gap ... { for (k=1; kss_dssp, >ss_pred, >ss_conf, >aa_... sequences { while((c=seq[k][l++])) // assign residue to c at same time { if (c>='a' && c<='z') I[k][i-1]++;//insert state = lower case character else if (c!='.') //match state = upper case character { X[k][i]=aa2i(c); I[k][i]=0; i++; } } } else if (k==kss_dssp || k==kss_pred) // does alignment contain sequence of secondary structure states? { while((c=seq[k][l++])) // assign residue to c at same time if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=ss2i(c); //match state = upper case character } else if (k==ksa_dssp) // does alignment contain sequence of prediction confidence values? { while((c=seq[k][l++])) // assign residue to c at same time if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=sa2i(c); //match state = upper case character } else if (k==kss_conf) // does alignment contain sequence of prediction confidence values? { while((c=seq[k][l++])) // assign residue to c at same time if (c!='.') X[k][i++]=cf2i(c); //match state = 0-9 or '-' } else if (k==kfirst) // does alignment contain sequence of prediction confidence values? { while((c=seq[k][l++])) // assign residue to c at same time if (c!='.') { X[k][i]=aa2i(c); I[k][i]=0; ++i; } } else continue; i--; if (L!=i && L!=/*MAXRES*/par.maxResLen-2 && !unequal_lengths) unequal_lengths=k; //sequences have different lengths L=imin(L,i); } // end for (k) if (unequal_lengths) break; //Replace GAP with ENDGAP for all end gaps /* MR1 */ for (k=0; k=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */ } for (i=1; i<=L; i++) this->l[i]=i; //assign column indices to match states if (L<=0) { cout<<"\nError: Alignment in "< option"<=2) { printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",L); break; } if (v>=2) cout<<"Alignment in "<=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */ } // Add up percentage of gaps #if 0 //#ifdef HAVE_OPENMP #pragma omp parallel for schedule(static) private(k) #endif for (l=1; l<=L; l++) { float res=0; float gap=0; for (k=0; k< N_in; k++){ if (keep[k]){ if ( X[k][l]=4) cout<<"percent gaps["<=100, make slightly bigger than 100% -- to be sure to be sure */ #define DEFAULT_MGAPS 100.0 /* Soeding's default is 50, omega default prior to telomere logic was 100 FIXME: this used to be par.Mgaps, in a later version re-introduce par.Mgaps to keep this value flexible */ #define TELOMER_LENGTH 10 /* this parameter must be > 0 (unless DEFAULT_MGAPS=100), if it is too big (L/2) then telomere logic has no effect, don't think it should be changed (much) */ #define TELOMER_FRACTION 0.10 //#define HMM_MIN_LENGTH 0.923 #define HMM_MIN_LENGTH 0.950 #define FORTRAN_OFFSET 1 double dDefaultMgaps; dDefaultMgaps = DEFAULT_MGAPS; #if TELOMERE_LOGIC /* turn telomere logic on (1) or off (0) */ int iTelomereLength; #if TELOMERE_DYNAMIC /* keep telomere length 'dynamic' */ iTelomereLength = TELOMER_LENGTH > (int)(L*TELOMER_FRACTION) ? TELOMER_LENGTH : (int)(L*TELOMER_FRACTION); #else iTelomereLength = TELOMER_LENGTH; #endif /* this was dynamic telomere */ #endif /* this was telomere logic */ /* if HMMs get too small (much smaller than profile length L) then one is liable to get a back-tracking error. So we should ensure that the DEFAULT_MGAPS parameter does NOT shrink the HMM too much. take percentage-gap vector, sort it, and fix dDefaultMgaps, such that at least (HMM_MIN_LENGTH)*(L) are left */ #if MGAP_LOGIC /* try to adapt Mgaps to size of final HMM */ { float *pfPercentGaps = NULL; if (NULL == (pfPercentGaps = (float *)malloc((L+1)*sizeof(float)))){ printf("%s:%s:%d: could not malloc %d float for sorted percent-gaps\n", __FUNCTION__, __FILE__, __LINE__, L+1); dDefaultMgaps = DEFAULT_MGAPS; } else { for (l = 0; l < L; l++) { pfPercentGaps[l] = percent_gaps[l+FORTRAN_OFFSET]; } qsort(pfPercentGaps, L, sizeof(float), CompFltAsc); dDefaultMgaps = pfPercentGaps[(int)(HMM_MIN_LENGTH*L)]; if (dDefaultMgaps < DEFAULT_MGAPS){ //printf("Mgaps = %f <- %f\n", DEFAULT_MGAPS, dDefaultMgaps); dDefaultMgaps = DEFAULT_MGAPS; } else { //printf("Mgaps = %f\n", dDefaultMgaps); } free(pfPercentGaps); pfPercentGaps = NULL; } } #endif /* tried to adapt Mgaps to size of final HMM */ // Throw out insert states and keep only match states i=0; for (k=0; k=/*MAXRES*/par.maxResLen-2) { if (v>=1) printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i); break; } i++; this->l[i]=l; #if 0 //#ifdef HAVE_OPENMP #pragma omp parallel for schedule(static) #endif for (k=0; k=2) cout<<"Alignment in "<=3) printf("Length of first seq = %i\n",L); // Check for sequences with unequal lengths for (k=1; k match state for (l=1; l<=L; l++) if (isalpha(seq[kfirst][l])) match_state[l]=1; else match_state[l]=0; // Throw out insert states and keep only match states for (k=0; k=/*MAXRES*/par.maxResLen-2) { if (v>=1) printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i); break; } i++; this->l[i]=l; for (k=0; k=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */ } if (v>=2) cout<<"Alignment in "<=2 && !par.cons) { for (i=1; i<=L; i++) if (X[kfirst][i]==GAP) { printf("NOTE: Use the '-cons' option to calculate a consensus sequence as first sequence of the alignment.\n"); break; } } /* MR1 //Replace GAP with ENDGAP for all end gaps for (k=0; k=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; }*/ // DEBUG if (v>=4) for (k=0; k"<9) cout<<"X"; else cout<