1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: ktuple_pair.c 230 2011-04-09 15:37:50Z andreas $
21 * K-Tuple code for pairwise alignment (Wilbur and Lipman, 1983; PMID
22 * 6572363). Most code taken from showpair.c (Clustal 1.83)
23 * DD: some functions now have lots of parameters as static variables
24 * were removed to make code OpenMP-friendly
44 #include "squid/squid.h"
46 #include "symmatrix.h"
47 #include "ktuple_pair.h"
51 #define END_MARK -3 /* see interface.c in 1.83 */
52 #define NUMRES 32 /* max size of comparison matrix */
55 #undef SORT_LAST_ELEMENT_AS_WELL
57 /* gap_pos1 = NUMRES-2; /@ code for gaps inserted by clustalw @/ */
58 static const int GAP_POS2 = NUMRES-1; /* code for gaps already in alignment */
59 static bool DNAFLAG = FALSE;
61 static const char *AMINO_ACID_CODES = "ABCDEFGHIKLMNPQRSTUVWXYZ-";
62 static const char *NUCLEIC_ACID_CODES = "ACGTUN-";
63 /* As far as I understand the gap symbol should not be necessary here,
64 * because we use isgap for testing later anyway. But changing this,
65 * will affect max_res_code and max_nuc as well. So I leave it for now
69 static bool percent = TRUE;
71 static void make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array);
72 static void put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum);
73 static bool frag_rel_pos(int a1, int b1, int a2, int b2, int ktup);
74 static void des_quick_sort(int *array1, int *array2, const int array_size);
75 static void pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param,
76 char **seq_array, int *maxsf, int **accum, int max_aln_length,
77 int *zza, int *zzb, int *zzc, int *zzd);
78 static void encode(char *seq, char *naseq, int l, const char *res_codes);
79 static int res_index(const char *lookup, char c);
89 /* default ktuple pairwise alignment parameters
94 /* designated initializer */
95 const ktuple_param_t default_protein_param = {
103 /* designated initializer */
104 const ktuple_param_t default_dna_param = {
113 * note: naseq should be unit-offset
116 encode(char *seq, char *naseq, int l, const char *res_codes)
118 /* code seq as ints .. use GAP_POS2 for gap */
120 bool seq_contains_unknown_char = FALSE;
121 /*LOG_DEBUG("seq=%s naseq=%p l=%d", &(seq[1]), naseq, l); */
124 for (i=1; i<=l; i++) {
125 char res = toupper(seq[i]);
127 naseq[i] = GAP_POS2; /* gap in input */
129 naseq[i] = res_index(res_codes, res);
132 /*LOG_DEBUG("Character '%c' at pos %d", res, i);*/
133 if (-1 == naseq[i]) {
134 seq_contains_unknown_char = TRUE;
135 /*LOG_DEBUG("Unknown character '%c' at pos %d", res, i);*/
137 /*LOG_DEBUG("na_seq[%d]=%d", i, naseq[i]);*/
140 if (TRUE == seq_contains_unknown_char)
141 Log(&rLog, LOG_WARN, "Unknown character in seq '%s'", &(seq[1]));
154 res_index(const char *t, char c)
157 for (i=0; t[i] && t[i] != c; i++)
165 /* end of res_index */
172 make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array)
174 /* FIXME make 10 a constant and give it a nice name */
176 int i, j, code, flag;
178 const int limit = (int) pow((double)(max_res_code+1),(double)ktup);
180 for (i=1;i<=ktup;i++)
181 a[i] = (int) pow((double)(max_res_code+1),(double)(i-1));
183 for (i=1; i<=limit; ++i)
188 for (i=1; i<=(l-ktup+1); ++i) {
191 for (j=1; j<=ktup; ++j) {
192 /* Log(&rLog, LOG_FORCED_DEBUG, "naseq=%d i=%d j=%d seq_array[naseq]=%p",
193 * naseq, i, j, seq_array[naseq]);
195 residue = seq_array[naseq][i+j-1];
196 /* Log(&rLog, LOG_FORCED_DEBUG, "residue = %d", residue); */
197 if ((residue<0) || (residue > max_res_code)){
201 code += ((residue) * a[j]);
213 /* end of make_ptrs */
218 * FIXME Why hardcoding of 5?
221 put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum)
224 accum[0][curr_frag]=fs;
225 accum[1][curr_frag]=v1;
226 accum[2][curr_frag]=v2;
227 accum[3][curr_frag]=flen;
231 accum[4][curr_frag]=0;
235 if (fs >= accum[0][*maxsf]) {
236 accum[4][curr_frag]=*maxsf;
243 *next=accum[4][*next];
244 if (fs>=accum[0][*next])
247 accum[4][curr_frag]=*next;
248 accum[4][end]=curr_frag;
253 /* end of put_frag */
260 frag_rel_pos(int a1, int b1, int a2, int b2, int ktup)
262 if (a1-b1 == a2-b2) {
267 if (a2+ktup-1<a1 && b2+ktup-1<b1) {
273 /* end of frag_rel_pos */
280 * @note: This is together with des_quick_sort most time consuming
281 * routine according to gprof on r110. Tried to replace it with qsort
282 * and/or QSortAndTrackIndex(), which is always slower! So we keep the
285 * Original doc: Quicksort routine, adapted from chapter 4, page 115
286 * of software tools by Kernighan and Plauger, (1986). Sort the
287 * elements of array1 and sort the elements of array2 accordingly
289 * There might be a bug here. The original function apparently never
290 * touches the last element and keeps it as is. Tried to fix this (see
291 * SORT_LAST_ELEMENT_AS_WELL) which gives slightly worse performance
292 * (-0.5% on BB). My fix might not be working or it's not a bug at
299 des_quick_sort(int *array1, int *array2, const int array_size)
304 int lst[50], ust[50]; /* the maximum no. of elements must be*/
305 /* < log(base2) of 50 */
308 for (i=1; i<=array_size; i++) {
309 Log(&rLog, LOG_FORCED_DEBUG, "b4 sort array1[%d]=%d array2[%d]=%d", i, array1[i], i, array2[i]);
314 #ifdef SORT_LAST_ELEMENT_AS_WELL
318 ust[1] = array_size-1;
324 if (lst[p] >= ust[p]) {
331 for (i=i+1; array1[i] < pivlin; i++)
333 for (j=j-1; j > i; j--)
334 if (array1[j] <= pivlin) break;
337 array1[i] = array1[j];
341 array2[i] = array2[j];
349 array1[i] = array1[j];
353 array2[i] = array2[j];
356 if (i-lst[p] < ust[p] - i) {
371 for (i=1; i<=array_size; i++) {
372 Log(&rLog, LOG_FORCED_DEBUG, "after sort array1[%d]=%d array2[%d]=%d", i, array1[i], i, array2[i]);
378 /* end of des_quick_sort */
384 * FIXME together with des_quick_sort most time consuming routine
385 * according to gprof on r110
389 pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param,
390 char **seq_array, int *maxsf, int **accum, int max_aln_length,
391 int *zza, int *zzb, int *zzc, int *zzd)
393 int next; /* forrmerly static */
394 int pot[8],i, j, l, m, flag, limit, pos, vn1, vn2, flen, osptr, fs;
395 int tv1, tv2, encrypt, subt1, subt2, rmndr;
401 const int tl1 = (l1+l2)-1;
403 assert(NULL!=aln_param);
406 Log(&rLog, LOG_FORCED_DEBUG, "DNAFLAG=%d seq_no=%d l1=%d l2=%d window=%d ktup=%d signif=%d wind_gap=%d",
407 DNAFLAG, seq_no, l1, l2, window, ktup, signif,
411 slopes = (char *) CKCALLOC(tl1+1, sizeof(char));
412 displ = (int *) CKCALLOC(tl1+1, sizeof(int));
413 diag_index = (int *) CKMALLOC((tl1+1) * sizeof(int));
415 for (i=1; i<=tl1; ++i) {
416 /* unnecessary, because we calloced: slopes[i] = displ[i] = 0; */
420 for (i=1;i<=aln_param->ktup;i++)
421 pot[i] = (int) pow((double)(max_res_code+1),(double)(i-1));
422 limit = (int) pow((double)(max_res_code+1),(double)aln_param->ktup);
426 /* increment diagonal score for each k_tuple match */
428 for (i=1; i<=limit; ++i) {
443 /* choose the top SIGNIF diagonals
446 #ifdef QSORT_REPLACEMENT
447 /* This was an attempt to replace des_quick_sort with qsort(),
448 * which turns out to be much slower, so don't use this
451 /* FIXME: if we use this branch, we don't need to init diag_index
452 * before, because that is done in QSortAndTrackIndex()
456 for (i=1; i<=tl1; i++) {
457 Log(&rLog, LOG_FORCED_DEBUG, "b4 sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]);
461 QSortAndTrackIndex(&(diag_index[1]), &(displ[1]), tl1, 'a', TRUE);
464 for (i=1; i<=tl1; i++) {
465 Log(&rLog, LOG_FORCED_DEBUG, "after sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]);
471 des_quick_sort(displ, diag_index, tl1);
475 j = tl1 - aln_param->signif + 1;
481 /* flag all diagonals within WINDOW of a top diagonal */
483 for (i=tl1; i>=j; i--) {
486 l = (1 > pos - aln_param->window) ?
487 1 : pos - aln_param->window;
488 m = (tl1 < pos + aln_param->window) ?
489 tl1 : pos + aln_param->window;
495 for (i=1; i<=tl1; i++) {
501 for (i=1; i<=(l1-aln_param->ktup+1); ++i) {
503 for (j=1; j<=aln_param->ktup; ++j) {
504 residue = seq_array[seq_no][i+j-1];
505 if ((residue<0) || (residue>max_res_code)) {
509 encrypt += ((residue)*pot[j]);
525 if (1 != slopes[osptr]) {
540 if (curr_frag >= 2*max_aln_length) {
541 Log(&rLog, LOG_VERBOSE, "(Partial alignment)");
542 goto free_and_exit; /* Yesss! Always wanted to
545 displ[osptr]=curr_frag;
546 put_frag(fs, i, vn2, flen, curr_frag, &next, maxsf, accum);
552 if (frag_rel_pos(i, vn2, tv1, tv2, aln_param->ktup)) {
553 if (i-vn2 == accum[1][next]-accum[2][next]) {
554 if (i > accum[1][next]+(aln_param->ktup-1)) {
555 fs = accum[0][next]+aln_param->ktup;
557 rmndr = i-accum[1][next];
558 fs = accum[0][next]+rmndr;
565 if (0 == displ[osptr]) {
566 subt1=aln_param->ktup;
568 if (i > accum[1][displ[osptr]]+(aln_param->ktup-1)) {
569 subt1=accum[0][displ[osptr]]+aln_param->ktup;
571 rmndr=i-accum[1][displ[osptr]];
572 subt1=accum[0][displ[osptr]]+rmndr;
575 subt2=accum[0][next] - aln_param->wind_gap + aln_param->ktup;
608 /* end of pair_align */
614 * Will compute ktuple scores and store in tmat
615 * Following values will be set: tmat[i][j], where
620 * tmat data members have to be preallocated
622 * if ktuple_param_t *aln_param == NULL defaults will be used
625 KTuplePairDist(symmatrix_t *tmat, mseq_t *mseq,
626 int istart, int iend,
627 int jstart, int jend,
628 ktuple_param_t *param_override,
629 progress_t *prProgress,
630 unsigned long int *ulStepNo, unsigned long int ulTotalStepNo)
632 /* this first group of variables were previously static
633 and hence un-parallelisable */
638 /* divide score with length of smallest sequence */
639 int *zza, *zzb, *zzc, *zzd;
640 int private_step_no = 0;
644 int max_res_code = -1;
648 /* progress_t *prProgress; */
649 /* int uStepNo, uTotalStepNo; */
650 ktuple_param_t aln_param = default_protein_param;
651 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
654 if(prProgress == NULL) {
655 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
656 "Ktuple-distance calculation progress", bPrintCR);
659 /* conversion to old style data types follows
663 seqlen_array = (int*) CKMALLOC((mseq->nseqs+1) * sizeof(int));
664 for (i=0; i<mseq->nseqs; i++) {
665 seqlen_array[i+1] = mseq->sqinfo[i].len;
668 /* setup alignment parameters
670 if (SEQTYPE_PROTEIN == mseq->seqtype) {
672 max_res_code = strlen(AMINO_ACID_CODES)-2;
673 aln_param = default_protein_param;
675 } else if (SEQTYPE_RNA == mseq->seqtype || SEQTYPE_DNA == mseq->seqtype) {
677 max_res_code = strlen(NUCLEIC_ACID_CODES)-2;
678 aln_param = default_dna_param;
681 Log(&rLog, LOG_FATAL, "Internal error in %s: Unknown sequence type.", __FUNCTION__);
684 if (NULL!=param_override) {
685 aln_param.ktup = param_override->ktup;
686 aln_param.wind_gap = param_override->wind_gap;
687 aln_param.signif = param_override->signif;
688 aln_param.window = param_override->window;
691 /*LOG_DEBUG("DNAFLAG = %d max_res_code = %d", DNAFLAG, max_res_code);*/
693 /* convert mseq to clustal's old-style int encoded sequences (unit-offset)
697 seq_array = (char **) CKMALLOC((mseq->nseqs+1) * sizeof(char *));
699 /* FIXME check that non of the seqs is smaller than ktup (?).
700 * Otherwise segfault occurs
702 for (i=0; i<mseq->nseqs; i++) {
703 seq_array[i+1] = (char *) CKMALLOC((seqlen_array[i+1]+2) * sizeof (char));;
705 for (i=0; i<mseq->nseqs; i++) {
706 /*LOG_DEBUG("calling encode with seq_array[%d+1] len=%d and seq=%s",
707 i, seqlen_array[i+1], mseq->seq[i]);*/
708 if (TRUE == DNAFLAG) {
709 encode(&(mseq->seq[i][-1]), seq_array[i+1],
710 seqlen_array[i+1], NUCLEIC_ACID_CODES);
712 encode(&(mseq->seq[i][-1]), seq_array[i+1],
713 seqlen_array[i+1], AMINO_ACID_CODES);
716 if (seqlen_array[i+1]>max_seq_len) {
717 max_seq_len = seqlen_array[i+1];
720 max_aln_length = max_seq_len * 2;
721 /* see sequence.c in old source */
723 /* FIXME: short sequences can cause seg-fault
724 * because max_aln_length can get shorter
725 * than (max_res_code+1)^k
727 max_aln_length = max_aln_length > pow((max_res_code+1), aln_param.ktup)+1 ?
728 max_aln_length : pow((max_res_code+1), aln_param.ktup)+1;
732 * conversion to old style clustal done (in no time) */
735 accum = (int **) CKCALLOC(5, sizeof (int *));
737 accum[i] = (int *) CKCALLOC((2*max_aln_length+1), sizeof(int));
739 zza = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
740 zzb = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
741 zzc = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
742 zzd = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
744 /* estimation of total number of steps (if istart and jstart are
745 * both 0) (now handled in the calling routine)
747 /* uTotalStepNo = iend*jend - iend*iend/2 + iend/2;
749 /*LOG_DEBUG("istart=%d iend=%d jstart=%d jend=%d", istart, iend, jstart, jend);*/
751 for (i=istart+1; i<=iend; ++i) {
752 /* by definition a sequence compared to itself should give
754 SymMatrixSetValue(tmat, i-1, i-1, 0.0);
755 make_ptrs(zza, zzc, i, seqlen_array[i], aln_param.ktup, max_res_code, seq_array);
758 #pragma omp critical(ktuple)
761 ProgressLog(prProgress, *ulStepNo, ulTotalStepNo, FALSE);
764 for (j=MAX(i+1, jstart+1); j<=jend; ++j) {
767 /*LOG_DEBUG("comparing pair %d:%d", i, j);*/
769 make_ptrs(zzb, zzd, j, seqlen_array[j], aln_param.ktup, max_res_code, seq_array);
770 pair_align(i, seqlen_array[i], seqlen_array[j], max_res_code, &aln_param,
771 seq_array, &maxsf, accum, max_aln_length, zza, zzb, zzc, zzd);
776 calc_score=(double)accum[0][maxsf];
778 dsr=(seqlen_array[i]<seqlen_array[j]) ?
779 seqlen_array[i] : seqlen_array[j];
780 calc_score = (calc_score/(double)dsr) * 100.0;
784 /* printf("%d %d %d\n", i-1, j-1, (100.0 - calc_score)/100.0); */
785 SymMatrixSetValue(tmat, i-1, j-1, (100.0 - calc_score)/100.0);
787 /* the function allows you not to compute the full matrix.
788 * here we explicitely make the resulting matrix a
789 * rectangle, i.e. we always set full rows. in other
790 * words, if we don't complete the full matrix then we
791 * don't have a full symmetry. so only use the defined
794 /*LOG_DEBUG("setting %d : %d = %f", j, i, tmat[i][j]);*/
795 /* not needed anymore since we use symmatrix_t
797 tmat[j][i] = tmat[i][j];
801 #pragma omp critical(ktuple)
804 Log(&rLog, LOG_DEBUG, "K-tuple distance for sequence pair %d:%d = %lg",
805 i, j, SymMatrixGetValue(tmat, i-1, j-1));
810 Log(&rLog, LOG_FORCED_DEBUG, "uTotalStepNo=%d for istart=%d iend=%d jstart=%d jend=%d", uStepNo, istart, iend, jstart, jend);
811 Log(&rLog, LOG_FORCED_DEBUG, "Fabian = %d", iend*jend - iend*iend/2 + iend/2);
814 /* printf("\n\n%d\t%d\t%d\t%d\n\n", omp_get_thread_num(), uStepNo, istart, iend); */
822 #pragma omp critical(ktuple)
825 printf("steps: %d\n", private_step_no);
837 for (i=1; i<=mseq->nseqs; i++) {
838 CKFREE(seq_array[i]);
842 /* end of KTuplePairDist */