1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: hhfullalignment-C.h 243 2011-05-31 13:49:19Z fabian $
25 #include <iostream> // cin, cout, cerr
26 #include <fstream> // ofstream, ifstream
27 #include <stdio.h> // printf
28 #include <stdlib.h> // exit
29 #include <string> // strcmp, strstr
30 #include <math.h> // sqrt, pow
31 #include <limits.h> // INT_MIN
32 #include <float.h> // FLT_MIN
33 #include <time.h> // clock
34 #include <ctype.h> // islower, isdigit etc
41 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
42 #include "list.h" // list data structure
43 #include "hash.h" // hash data structure
44 #include "hhdecl-C.h" // constants, class
45 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
46 #include "hhhmm.h" // class HMM
47 #include "hhalignment.h" // class Alignment
49 #include "hhhalfalignment.h"
54 //////////////////////////////////////////////////////////////////////////////
55 //////////////////////////////////////////////////////////////////////////////
56 // Methods of class FullAlignment
57 //////////////////////////////////////////////////////////////////////////////
58 //////////////////////////////////////////////////////////////////////////////
61 //////////////////////////////////////////////////////////////////////////////
62 // Output Results: use classes HalfAlignment and FullAlignment
65 // Each column list contains at least a match state
66 // Insert states between the match states are omitted
68 // step 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1
69 // i 0 0 1 2 3 4 5 6 7 8 9 9 9 9 10 11 12 13 14
71 // Q ~ ~ X X X X X X X X X ~ ~ ~ X X X X X
72 // T Y Y Y Y Y Y Y Y ~ Y Y Y Y Y Y Y Y ~ ~
74 // j 7 8 9 10 11 12 13 14 14 15 16 17 18 19 20 21 22 22 22
75 // state IM IM MM MM MM MM MM MM DG MM MM GD GD GD MM MM MM MI MI
81 //////////////////////////////////////////////////////////////////////////////
83 FullAlignment::FullAlignment(int maxseqdis)
85 qa = new HalfAlignment(maxseqdis);
86 ta = new HalfAlignment(maxseqdis);
87 for (int h=0; h<LINELEN-1; h++) symbol[h]=' ';
90 //////////////////////////////////////////////////////////////////////////////
92 FullAlignment::~FullAlignment()
98 //////////////////////////////////////////////////////////////////////////////
99 // Free memory of arrays s[][], l[][], and m[][] in HalfAlignment
100 void FullAlignment::FreeMemory()
106 //////////////////////////////////////////////////////////////////////////////
108 * @brief Add columns for match (and delete) states.
111 FullAlignment::AddColumns(int i, int j, char prev_state, char state, float S)
115 case MM: //MM pair state (both query and template in Match state)
116 AddGaps(); //fill up gaps until query and template parts have same length
117 symbol[qa->pos] =ScoreChr(S);
120 qa->AddInsertsAndFillUpGaps(i);
121 ta->AddInsertsAndFillUpGaps(j);
125 if (prev_state==DG) AddGaps();
127 ta->AddColumn(j); //query has gap -> add nothing
128 ta->AddInsertsAndFillUpGaps(j);
131 if (prev_state==MI) AddGaps();
133 ta->AddColumn(j); //query has gap -> add nothing
134 ta->AddInsertsAndFillUpGaps(j);
138 if (prev_state==GD) AddGaps();
140 qa->AddColumn(i);//template has gap -> add nothing
141 qa->AddInsertsAndFillUpGaps(i);
144 if (prev_state==IM) AddGaps();
146 qa->AddColumn(i);//template has gap -> add nothing
147 qa->AddInsertsAndFillUpGaps(i);
152 /////////////////////////////////////////////////////////////////////////////////////
154 * @brief Fill up gaps until query and template parts have same length
157 FullAlignment::AddGaps()
159 while (qa->pos<ta->pos) qa->AddChar('.');
160 while (ta->pos<qa->pos) ta->AddChar('.');
164 //////////////////////////////////////////////////////////////////////////////
166 * @brief Build full alignment -> qa->s[k][h] and ta->s[k][h]
169 FullAlignment::Build(HMM& q, Hit& hit)
172 char prev_state=MM, state=MM;
176 identities=0; // number of identical residues in query and template sequence
177 score_sim=0.0f;// substitution matrix similarity score between query and template
181 // Set up half-alignments
182 // n is the sequence index up to which sequences are prepared for display
183 n = imin( q.n_display,par.nseqdis+( q.nss_dssp>=0)+( q.nsa_dssp>=0)+( q.nss_pred>=0)+( q.nss_conf>=0)+( q.ncons>=0));
184 qa->Set( q.name, q.seq, q.sname, n, q.L, q.nss_dssp , q.nss_pred , q.nss_conf, q.nsa_dssp, q.ncons, hit.L/*<--FS*/);
185 n = imin(hit.n_display,par.nseqdis+(hit.nss_dssp>=0)+(hit.nsa_dssp>=0)+(hit.nss_pred>=0)+(hit.nss_conf>=0)+(hit.ncons>=0));
186 ta->Set(hit.name,hit.seq,hit.sname, n,hit.L,hit.nss_dssp,hit.nss_pred,hit.nss_conf,hit.nsa_dssp, hit.ncons, q.L/*<--FS*/);
189 // printf("HMM: %s\nstep nst i j state hq ht\n",hit.name);
192 if ((1 == hit.i1) && (1 == hit.j1)){
195 else if ((1 == hit.i1) && (1 != hit.j1)){
196 for (int j = 1; j < hit.j1; j++){
197 AddColumns(0, j, MM, IM, 0.0);
199 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
201 printf("%d: j1=%d -> temp has leading gaps\n", __LINE__, hit.j1);
202 for (iK = 0; iK < ta->n; iK++){
203 //printf("T%d: %s\n", iK, ta->s[iK]);
204 for (iL = 0; iL < hit.j1; iL++){
205 ta->s[iK][iL] = tolower(ta->s[iK][iL]);
210 else if ((1 != hit.i1) && (1 == hit.j1)){
211 for (int i = 1; i < hit.i1; i++){
212 AddColumns(i, 0, MM, MI, 0.0);
214 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
215 FILE *fp = rLog.prFP[LOG_DEBUG];
217 fprintf(fp, "%d: i1=%d -> query has leading gaps\n", __LINE__, hit.i1);
219 for (iK = 0; iK < qa->n; iK++){
220 //printf("Q%d: %s\n", iK, qa->s[iK]);
221 for (iL = 0; iL < hit.i1; iL++){
222 qa->s[iK][iL] = tolower(qa->s[iK][iL]);
229 "+-------------------------------+\n"
230 "| both sequences truncated left |\n"
231 "+-------------------------------+\n"
232 "i1 = %d, j1 = %d\n", hit.i1, hit.j1);
233 return FAILURE; /* FS, r241 -> r243 */
237 for (step=hit.nsteps; step>=1; step--)
240 state = hit.states[step];
242 // Add column to alignment and compute identities and sequence-sequence similarity score
243 AddColumns(hit.i[step],hit.j[step],prev_state,state,hit.S[step]);
246 char qc=qa->seq[ q.nfirst][ qa->m[ q.nfirst][hit.i[step]] ];
247 char tc=ta->seq[hit.nfirst][ ta->m[hit.nfirst][hit.j[step]] ];
248 if (qc==tc) identities++; // count identical amino acids
249 score_sim += S[(int)aa2i(qc)][(int)aa2i(tc)];
250 // fprintf(stderr,"%3i %3i %3i %3i %3i %1c %1c %6.2f %6.2f %6.2f %6.2f \n",step,hit.nsteps,hit.i[step],hit.j[step],int(state),qc,tc,S[(int)aa2i(qc)][(int)aa2i(tc)],score_sim,hit.P_posterior[step],hit.sum_of_probs); //DEBUG
255 if ((qa->L == hit.i2) && (ta->L == hit.j2)){
258 else if ((qa->L == hit.i2) && (ta->L != hit.j2)){
259 for (int j = hit.j2+1; j <= ta->L; j++){
260 AddColumns(0, j, MM, IM, 0.0);
262 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
265 FILE *fp = rLog.prFP[LOG_DEBUG];
267 fprintf(fp, "%d: j2=%d (%d) -> temp has trailing gaps\n", __LINE__, hit.j2, ta->L);
268 for (iK = 0; iK < ta->n; iK++){
269 //printf("T%d: %s\n", iK, ta->s[iK]+strlen(ta->s[iK])-(ta->L-hit.j2));
270 for (uiL = strlen(ta->s[iK])-(ta->L-hit.j2); uiL < strlen(ta->s[iK]); uiL++){
271 ta->s[iK][uiL] = tolower(ta->s[iK][uiL]);
276 else if ((qa->L != hit.i2) && (ta->L == hit.j2)){
277 for (int i = hit.i2+1; i <= qa->L; i++){
278 AddColumns(i, 0, MM, MI, 0.0);
280 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
283 printf("%d: i2=%d (%d)-> query has trailing gaps\n", __LINE__, hit.i2, qa->L);
284 for (iK = 0; iK < qa->n; iK++){
285 //printf("Q%d: %s\n", iK, qa->s[iK]+strlen(qa->s[iK])-(qa->L-hit.i2));
286 for (uiL = strlen(qa->s[iK])-(qa->L-hit.i2); uiL < strlen(qa->s[iK]); uiL++){
287 qa->s[iK][uiL] = tolower(qa->s[iK][uiL]);
294 "+--------------------------------+\n"
295 "| both sequences truncated right |\n"
296 "+--------------------------------+\n"
297 "i2 = %d != %d = qa->L, j2 = %d != %d = ta->L\n",
298 hit.i2, qa->L, hit.j2, ta->L);
299 return FAILURE; /* FS, r241 -> r243 */
304 AddGaps(); //fill up gaps until query and template parts have same length
308 // Change gap symbol '.' (gap aligned to insert) to '~' if one HMM has gap with respect to other HMM
309 for (hh=1; hh<qa->pos; hh++)
313 // Gap in query (IM or GD state)
315 for (k=0; k<qa->n; k++) if (qa->s[k][hh]=='.') qa->s[k][hh]='-';
317 else if (symbol[hh]=='T')
319 // Gap in target (MI or DG state)
321 for (k=0; k<ta->n; k++) if (ta->s[k][hh]=='.') ta->s[k][hh]='-';
325 } /* this is the end of FullAlignment::Build() */
328 //////////////////////////////////////////////////////////////////////////////
330 * @brief Print out header before full alignment
333 FullAlignment::PrintHeader(FILE* outf, HMM& q, Hit& hit)
335 fprintf(outf,">%s\n",hit.longname);
336 fprintf(outf,"Probab=%-.2f E-value=%-.2g Score=%-.2f Aligned_cols=%i Identities=%i%% Similarity=%-.3f Sum_probs=%.1f\n\n",
337 hit.Probab,hit.Eval,hit.score,hit.matched_cols,iround(100.0*identities/hit.matched_cols),score_sim/hit.matched_cols,hit.sum_of_probs);
340 //////////////////////////////////////////////////////////////////////////////
342 * @brief Print out full alignment in HHR format
345 FullAlignment::PrintHHR(FILE* outf, Hit& hit)
347 const int NLEN=14; //Length of name field in front of multiple alignment
348 int h=0; //counts position (column) in alignment
349 int hh=0; //points to column at start of present output block of alignment
350 int k; //counts sequences in query and template
351 //short unsigned int lq[MAXSEQ]; // lq[k] counts index of residue from query sequence k to be printed next;
352 short unsigned int *lq = NULL; // lq[k] counts index of residue from query sequence k to be printed next;
353 //short unsigned int lt[MAXSEQ]; // lt[k] counts index of residue from template sequence k to be printed next;
354 short unsigned int *lt = NULL; // lt[k] counts index of residue from template sequence k to be printed next;
355 char namestr[NAMELEN]; //name of sequence
356 int iq=hit.i1; // match state counter for query HMM (displayed in consensus line)
357 int jt=hit.j1; // match state counter for template HMM (displayed in consensus line)
359 lq = new(short unsigned int[qa->n+2]);
360 lt = new(short unsigned int[ta->n+2]);
362 for (k=0; k<qa->n; k++) lq[k]=qa->l[k][hit.i1];
363 for (k=0; k<ta->n; k++) lt[k]=ta->l[k][hit.j1];
365 while (hh<ta->pos-1) // print alignment block
367 // Print query secondary structure sequences
368 for (k=0; k<qa->n; k++)
370 if (k==qa->nsa_dssp) continue;
371 if (!(k==qa->nss_dssp || k==qa->nsa_dssp || k==qa->nss_pred || k==qa->nss_conf)) continue;
372 if (k==qa->nss_dssp && !par.showdssp) continue;
373 if ((k==qa->nss_pred || k==qa->nss_conf) && !par.showpred) continue;
374 strncpy(namestr,qa->sname[k],NAMELEN-2);
375 namestr[NAMELEN-1]='\0';
377 fprintf(outf,"Q %-*.*s ",NLEN,NLEN,namestr);
378 for (h=hh; h<imin(hh+par.aliwidth,qa->pos-1); h++) fprintf(outf,"%1c",qa->s[k][h]);
382 // Print query sequences
383 for (k=0; k<qa->n; k++)
385 if (k==qa->nss_dssp || k==qa->nsa_dssp || k==qa->nss_pred || k==qa->nss_conf || k==qa->ncons) continue;
386 strncpy(namestr,qa->sname[k],NAMELEN-2);
387 namestr[NAMELEN-1]='\0';
389 fprintf(outf,"Q %-*.*s %4i ",NLEN,NLEN,namestr,lq[k]);
390 for (h=hh; h<imin(hh+par.aliwidth,qa->pos-1); h++)
391 {fprintf(outf,"%1c",qa->s[k][h]); lq[k]+=WordChr(qa->s[k][h]);} //WordChr() returns 1 if a-z or A-Z; 0 otherwise
392 fprintf(outf," %4i (%i)\n",lq[k]-1,qa->l[k][qa->L+1]);
395 // Print query consensus sequence
396 if (par.showcons && qa->ncons>=0)
399 strncpy(namestr,qa->sname[k],NAMELEN-2);
400 namestr[NAMELEN-1]='\0';
402 fprintf(outf,"Q %-*.*s %4i ",NLEN,NLEN,namestr,iq);
403 for (h=hh; h<imin(hh+par.aliwidth,qa->pos-1); h++)
405 if (qa->s[k][h]=='x') qa->s[k][h]='~';
406 if (qa->s[k][h]!='-' && qa->s[k][h]!='.') iq++;
407 fprintf(outf,"%1c",qa->s[k][h]);
409 fprintf(outf," %4i (%i)\n",iq-1,qa->L);
413 // Print symbols representing the score
414 fprintf(outf," %*.*s ",NLEN,NLEN," ");
415 for (h=hh; h<imin(hh+par.aliwidth,qa->pos-1); h++) fprintf(outf,"%1c",symbol[h]);
418 // Print template consensus sequence
419 if (par.showcons && ta->ncons>=0)
422 strncpy(namestr,ta->sname[k],NAMELEN-2);
423 namestr[NAMELEN-1]='\0';
425 fprintf(outf,"T %-*.*s %4i ",NLEN,NLEN,namestr,jt);
426 for (h=hh; h<imin(hh+par.aliwidth,ta->pos-1); h++)
428 if (ta->s[k][h]=='x') ta->s[k][h]='~';
429 if (ta->s[k][h]!='-' && ta->s[k][h]!='.') jt++;
430 fprintf(outf,"%1c",ta->s[k][h]);
432 fprintf(outf," %4i (%i)\n",jt-1,ta->L);
434 // Print template sequences
435 for (k=0; k<ta->n; k++)
437 if (k==ta->nss_dssp || k==ta->nsa_dssp || k==ta->nss_pred || k==ta->nss_conf || k==ta->ncons) continue;
438 strncpy(namestr,ta->sname[k],NAMELEN-2);
439 namestr[NAMELEN-1]='\0';
441 fprintf(outf,"T %-*.*s %4i ",NLEN,NLEN,namestr,lt[k]);
442 for (h=hh; h<imin(hh+par.aliwidth,ta->pos-1); h++)
443 {fprintf(outf,"%1c",ta->s[k][h]); lt[k]+=WordChr(ta->s[k][h]);} //WordChr() returns 1 if a-z or A-Z; 0 otherwise
444 fprintf(outf," %4i (%i)\n",lt[k]-1,ta->l[k][ta->L+1]);
447 // Print template secondary structure sequences
448 for (k=0; k<ta->n; k++)
450 if (k==ta->nsa_dssp) continue;
451 if (!(k==ta->nss_dssp || k==ta->nss_pred || k==ta->nss_conf)) continue;
452 if (k==ta->nss_dssp && !par.showdssp) continue;
453 if ((k==ta->nss_pred || k==ta->nss_conf)&& !par.showpred) continue;
454 strncpy(namestr,ta->sname[k],NAMELEN-2);
455 namestr[NAMELEN-1]='\0';
457 fprintf(outf,"T %-*.*s ",NLEN,NLEN,namestr);
458 for (h=hh; h<imin(hh+par.aliwidth,ta->pos-1); h++) fprintf(outf,"%1c",ta->s[k][h]);
463 fprintf(outf,"\n\n");
466 delete[](lq); lq = NULL;
467 delete[](lt); lt = NULL;
469 } /* this is the rnd of PrintHHR() */
471 //////////////////////////////////////////////////////////////////////////////
472 // Print out full alignment in A2M format
473 //////////////////////////////////////////////////////////////////////////////
474 #define TELOMER_PRINT 0
475 void FullAlignment::PrintA2M(FILE* outf, Hit& hit)
477 int k; //counts sequences in query and template
480 int iTelomerLeft = hit.i1 - hit.j1;
481 int iTelomerRght = (qa->L - hit.i2) - (ta->L - hit.j2);
484 // Print query sequences
485 for (k=0; k<qa->n; k++)
487 if (k==qa->nsa_dssp) continue;
488 if (k==qa->nss_dssp && !par.showdssp) continue;
489 if ((k==qa->nss_pred || k==qa->nss_conf) && !par.showpred) continue;
490 if (k==qa->ncons && !par.showcons) continue;
491 fprintf(outf,">%s\n",qa->sname[k]);
493 /* @<print left cut-off@> */
494 if (iTelomerLeft > 0){
495 for (int iI = 1; iI <= iTelomerLeft; iI++){
496 fprintf(outf, "%1c", qa->seq[k][iI]);
498 /* this may be unnecessary */
499 for (int iI = iTelomerLeft+1; iI < hit.i1; iI++){
500 fprintf(outf, "%1c", qa->seq[k][iI]);
504 for (int iI = iTelomerLeft; iI < 0; iI++){
505 fprintf(outf, "%1c", '-');
507 /* this may be unnecessary */
510 /* @<print consensus@> */
511 for (h = 0; qa->s[k][h] > 0; h++){
512 fprintf(outf, "%1c", qa->s[k][h]);
514 /* @<print out rght cut-off@> */
515 for (int iI = hit.i2+1; iI <= qa->L; iI++){
516 fprintf(outf, "%1c", qa->seq[k][iI]);
518 for (int iI = 0; iI > iTelomerRght; iI--){
519 fprintf(outf, "%1c", '-');
522 for (h=0,hh=-par.aliwidth; qa->s[k][h]>0; h++,hh++)
524 if (!hh) {fprintf(outf,"\n"); hh-=par.aliwidth;}
525 fprintf(outf,"%1c",qa->s[k][h]);
531 // Print template sequences
532 for (k=0; k<ta->n; k++)
534 if (k==ta->nsa_dssp) continue;
535 if (k==ta->nss_dssp && !par.showdssp) continue;
536 if ((k==ta->nss_pred || k==ta->nss_conf) && !par.showpred) continue;
537 if (k==ta->ncons && !par.showcons) continue;
538 fprintf(outf,">%s\n",ta->sname[k]);
540 /* @<print left cut-off@> */
541 if (iTelomerLeft > 0){
542 for (int iI = 1; iI <= iTelomerLeft; iI++){
543 fprintf(outf, "%1c", '-');
545 /* this may be unnecessary */
546 for (int iI = iTelomerLeft+1; iI < hit.j1; iI++){
547 fprintf(outf, "%1c", ta->seq[k][iI]);
551 for (int iI = 1; iI <= -iTelomerLeft; iI++){
552 fprintf(outf, "%1c", ta->seq[k][iI]);
554 /* this may be unnecessary */
557 /* @<print consensus@> */
558 for (h = 0; ta->s[k][h] > 0; h++){
559 fprintf(outf, "%1c", ta->s[k][h]);
562 /* @<print out rght cut-off@> */
563 for (int iI = hit.j2+1; iI <= ta->L; iI++){
564 fprintf(outf, "%1c", ta->seq[k][iI]);
566 for (int iI = 0; iI < iTelomerRght; iI++){
567 fprintf(outf, "%1c", '-');
571 for (h=0,hh=-par.aliwidth; ta->s[k][h]>0; h++,hh++)
573 if (!hh) {fprintf(outf,"\n"); hh-=par.aliwidth;}
574 fprintf(outf,"%1c",ta->s[k][h]);
581 } /* this is the end of PrintA2M() */
583 ////////////////////////////////////////////////////////////////////////////
585 * @brief Print out full alignment in A2M format
588 FullAlignment::PrintFASTA(FILE* outf, Hit& hit)
590 // Transform sequences to uppercase and '.' to '-'
596 //////////////////////////////////////////////////////////////////////////////
598 * @brief Print out full alignment in A2M format
601 FullAlignment::PrintA3M(FILE* outf, Hit& hit)
603 // Remove all '.' from sequences
604 qa->RemoveChars('.');
605 ta->RemoveChars('.');
614 FullAlignment::OverWriteSeqs(char **ppcFirstProf, char **ppcSecndProf){
619 for (iS = 0; iS < qa->n; iS++){
620 for (iR = 0; iR < qa->pos; iR++){
621 cRes = qa->s[iS][iR];
622 ppcFirstProf[iS][iR] = ('.' == cRes) ? '-' : cRes;
624 } /* 0 <= iR < qa.L */
625 ppcFirstProf[iS][iR] = '\0';
626 } /* 0 <= iS < qa.n */
628 for (iS = 0; iS < ta->n; iS++){
629 for (iR = 0; iR < ta->pos; iR++){
630 cRes = ta->s[iS][iR];
631 ppcSecndProf[iS][iR] = ('.' == cRes) ? '-' : cRes;
633 } /* 0 <= iR < ta.L */
634 ppcSecndProf[iS][iR] = '\0';
635 } /* 0 <= iS < ta.n */
637 } /* this is the end of OverWriteSeqs() */