/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /********************************************************************* * Clustal Omega - Multiple sequence alignment * * Copyright (C) 2010 University College Dublin * * Clustal-Omega is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This file is part of Clustal-Omega. * ********************************************************************/ /* * RCS $Id: ktuple_pair.c 230 2011-04-09 15:37:50Z andreas $ * * * K-Tuple code for pairwise alignment (Wilbur and Lipman, 1983; PMID * 6572363). Most code taken from showpair.c (Clustal 1.83) * DD: some functions now have lots of parameters as static variables * were removed to make code OpenMP-friendly * */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #ifdef HAVE_OPENMP #include #endif #include "squid/squid.h" #include "util.h" #include "symmatrix.h" #include "ktuple_pair.h" #include "log.h" #include "progress.h" #define END_MARK -3 /* see interface.c in 1.83 */ #define NUMRES 32 /* max size of comparison matrix */ /* see notes below */ #undef SORT_LAST_ELEMENT_AS_WELL /* gap_pos1 = NUMRES-2; /@ code for gaps inserted by clustalw @/ */ static const int GAP_POS2 = NUMRES-1; /* code for gaps already in alignment */ static bool DNAFLAG = FALSE; static const char *AMINO_ACID_CODES = "ABCDEFGHIKLMNPQRSTUVWXYZ-"; static const char *NUCLEIC_ACID_CODES = "ACGTUN-"; /* As far as I understand the gap symbol should not be necessary here, * because we use isgap for testing later anyway. But changing this, * will affect max_res_code and max_nuc as well. So I leave it for now * as it is. AW */ static bool percent = TRUE; static void make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array); static void put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum); static bool frag_rel_pos(int a1, int b1, int a2, int b2, int ktup); static void des_quick_sort(int *array1, int *array2, const int array_size); static void pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param, char **seq_array, int *maxsf, int **accum, int max_aln_length, int *zza, int *zzb, int *zzc, int *zzd); static void encode(char *seq, char *naseq, int l, const char *res_codes); static int res_index(const char *lookup, char c); typedef struct { int i1; int i2; } two_ints_t; /* default ktuple pairwise alignment parameters * */ /* protein */ /* designated initializer */ const ktuple_param_t default_protein_param = { .ktup = 1, .wind_gap = 3, .signif = 5, .window = 5, }; /* dna */ /* designated initializer */ const ktuple_param_t default_dna_param = { .ktup = 2, .wind_gap = 5, .signif = 4, .window = 4, }; /** * note: naseq should be unit-offset */ static void encode(char *seq, char *naseq, int l, const char *res_codes) { /* code seq as ints .. use GAP_POS2 for gap */ register int i; bool seq_contains_unknown_char = FALSE; /*LOG_DEBUG("seq=%s naseq=%p l=%d", &(seq[1]), naseq, l); */ for (i=1; i<=l; i++) { char res = toupper(seq[i]); if (isgap(res)) { naseq[i] = GAP_POS2; /* gap in input */ } else { naseq[i] = res_index(res_codes, res); } /*LOG_DEBUG("Character '%c' at pos %d", res, i);*/ if (-1 == naseq[i]) { seq_contains_unknown_char = TRUE; /*LOG_DEBUG("Unknown character '%c' at pos %d", res, i);*/ } /*LOG_DEBUG("na_seq[%d]=%d", i, naseq[i]);*/ } if (TRUE == seq_contains_unknown_char) Log(&rLog, LOG_WARN, "Unknown character in seq '%s'", &(seq[1])); naseq[i] = END_MARK; return; } /* end of encode */ /** * */ static int res_index(const char *t, char c) { register int i; for (i=0; t[i] && t[i] != c; i++) ; if (t[i]) { return (i); } else { return -1; } } /* end of res_index */ /** * */ static void make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array) { /* FIXME make 10 a constant and give it a nice name */ static int a[10]; int i, j, code, flag; char residue; const int limit = (int) pow((double)(max_res_code+1),(double)ktup); for (i=1;i<=ktup;i++) a[i] = (int) pow((double)(max_res_code+1),(double)(i-1)); for (i=1; i<=limit; ++i) pl[i]=0; for (i=1; i<=l; ++i) tptr[i]=0; for (i=1; i<=(l-ktup+1); ++i) { code=0; flag=FALSE; for (j=1; j<=ktup; ++j) { /* Log(&rLog, LOG_FORCED_DEBUG, "naseq=%d i=%d j=%d seq_array[naseq]=%p", * naseq, i, j, seq_array[naseq]); */ residue = seq_array[naseq][i+j-1]; /* Log(&rLog, LOG_FORCED_DEBUG, "residue = %d", residue); */ if ((residue<0) || (residue > max_res_code)){ flag=TRUE; break; } code += ((residue) * a[j]); } if (flag) continue; ++code; if (0 != pl[code]) tptr[i] =pl[code]; pl[code] = i; } return; } /* end of make_ptrs */ /** * * FIXME Why hardcoding of 5? */ static void put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum) { int end; accum[0][curr_frag]=fs; accum[1][curr_frag]=v1; accum[2][curr_frag]=v2; accum[3][curr_frag]=flen; if (!*maxsf) { *maxsf=1; accum[4][curr_frag]=0; return; } if (fs >= accum[0][*maxsf]) { accum[4][curr_frag]=*maxsf; *maxsf=curr_frag; return; } else { *next=*maxsf; while (TRUE) { end=*next; *next=accum[4][*next]; if (fs>=accum[0][*next]) break; } accum[4][curr_frag]=*next; accum[4][end]=curr_frag; } return; } /* end of put_frag */ /** * */ static bool frag_rel_pos(int a1, int b1, int a2, int b2, int ktup) { if (a1-b1 == a2-b2) { if (a2 0) { if (lst[p] >= ust[p]) { p--; } else { i = lst[p] - 1; j = ust[p]; pivlin = array1[j]; while (i < j) { for (i=i+1; array1[i] < pivlin; i++) ; for (j=j-1; j > i; j--) if (array1[j] <= pivlin) break; if (i < j) { temp1 = array1[i]; array1[i] = array1[j]; array1[j] = temp1; temp2 = array2[i]; array2[i] = array2[j]; array2[j] = temp2; } } j = ust[p]; temp1 = array1[i]; array1[i] = array1[j]; array1[j] = temp1; temp2 = array2[i]; array2[i] = array2[j]; array2[j] = temp2; if (i-lst[p] < ust[p] - i) { lst[p+1] = lst[p]; ust[p+1] = i - 1; lst[p] = i + 1; } else { lst[p+1] = i + 1; ust[p+1] = ust[p]; ust[p] = i - 1; } p = p + 1; } } #if 0 for (i=1; i<=array_size; i++) { Log(&rLog, LOG_FORCED_DEBUG, "after sort array1[%d]=%d array2[%d]=%d", i, array1[i], i, array2[i]); } #endif return; } /* end of des_quick_sort */ /** * * FIXME together with des_quick_sort most time consuming routine * according to gprof on r110 * */ static void pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param, char **seq_array, int *maxsf, int **accum, int max_aln_length, int *zza, int *zzb, int *zzc, int *zzd) { int next; /* forrmerly static */ int pot[8],i, j, l, m, flag, limit, pos, vn1, vn2, flen, osptr, fs; int tv1, tv2, encrypt, subt1, subt2, rmndr; char residue; int *diag_index; int *displ; char *slopes; int curr_frag; const int tl1 = (l1+l2)-1; assert(NULL!=aln_param); /* Log(&rLog, LOG_FORCED_DEBUG, "DNAFLAG=%d seq_no=%d l1=%d l2=%d window=%d ktup=%d signif=%d wind_gap=%d", DNAFLAG, seq_no, l1, l2, window, ktup, signif, wind_gap); */ slopes = (char *) CKCALLOC(tl1+1, sizeof(char)); displ = (int *) CKCALLOC(tl1+1, sizeof(int)); diag_index = (int *) CKMALLOC((tl1+1) * sizeof(int)); for (i=1; i<=tl1; ++i) { /* unnecessary, because we calloced: slopes[i] = displ[i] = 0; */ diag_index[i] = i; } for (i=1;i<=aln_param->ktup;i++) pot[i] = (int) pow((double)(max_res_code+1),(double)(i-1)); limit = (int) pow((double)(max_res_code+1),(double)aln_param->ktup); /* increment diagonal score for each k_tuple match */ for (i=1; i<=limit; ++i) { vn1=zzc[i]; while (TRUE) { if (!vn1) break; vn2 = zzd[i]; while (0 != vn2) { osptr = vn1-vn2+l2; ++displ[osptr]; vn2 = zzb[vn2]; } vn1=zza[vn1]; } } /* choose the top SIGNIF diagonals */ #ifdef QSORT_REPLACEMENT /* This was an attempt to replace des_quick_sort with qsort(), * which turns out to be much slower, so don't use this */ /* FIXME: if we use this branch, we don't need to init diag_index * before, because that is done in QSortAndTrackIndex() * automatically. */ #if 0 for (i=1; i<=tl1; i++) { Log(&rLog, LOG_FORCED_DEBUG, "b4 sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]); } #endif QSortAndTrackIndex(&(diag_index[1]), &(displ[1]), tl1, 'a', TRUE); #if 0 for (i=1; i<=tl1; i++) { Log(&rLog, LOG_FORCED_DEBUG, "after sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]); } #endif #else des_quick_sort(displ, diag_index, tl1); #endif j = tl1 - aln_param->signif + 1; if (j < 1) { j = 1; } /* flag all diagonals within WINDOW of a top diagonal */ for (i=tl1; i>=j; i--) { if (displ[i] > 0) { pos = diag_index[i]; l = (1 > pos - aln_param->window) ? 1 : pos - aln_param->window; m = (tl1 < pos + aln_param->window) ? tl1 : pos + aln_param->window; for (; l <= m; l++) slopes[l] = 1; } } for (i=1; i<=tl1; i++) { displ[i] = 0; } curr_frag=*maxsf=0; for (i=1; i<=(l1-aln_param->ktup+1); ++i) { encrypt=flag=0; for (j=1; j<=aln_param->ktup; ++j) { residue = seq_array[seq_no][i+j-1]; if ((residue<0) || (residue>max_res_code)) { flag=TRUE; break; } encrypt += ((residue)*pot[j]); } if (flag) { continue; } ++encrypt; vn2=zzd[encrypt]; flag=FALSE; while (TRUE) { if (!vn2) { flag=TRUE; break; } osptr=i-vn2+l2; if (1 != slopes[osptr]) { vn2=zzb[vn2]; continue; } flen=0; fs=aln_param->ktup; next=*maxsf; /* * A-loop */ while (TRUE) { if (!next) { ++curr_frag; if (curr_frag >= 2*max_aln_length) { Log(&rLog, LOG_VERBOSE, "(Partial alignment)"); goto free_and_exit; /* Yesss! Always wanted to * use a goto (AW) */ } displ[osptr]=curr_frag; put_frag(fs, i, vn2, flen, curr_frag, &next, maxsf, accum); } else { tv1=accum[1][next]; tv2=accum[2][next]; if (frag_rel_pos(i, vn2, tv1, tv2, aln_param->ktup)) { if (i-vn2 == accum[1][next]-accum[2][next]) { if (i > accum[1][next]+(aln_param->ktup-1)) { fs = accum[0][next]+aln_param->ktup; } else { rmndr = i-accum[1][next]; fs = accum[0][next]+rmndr; } flen=next; next=0; continue; } else { if (0 == displ[osptr]) { subt1=aln_param->ktup; } else { if (i > accum[1][displ[osptr]]+(aln_param->ktup-1)) { subt1=accum[0][displ[osptr]]+aln_param->ktup; } else { rmndr=i-accum[1][displ[osptr]]; subt1=accum[0][displ[osptr]]+rmndr; } } subt2=accum[0][next] - aln_param->wind_gap + aln_param->ktup; if (subt2>subt1) { flen=next; fs=subt2; } else { flen=displ[osptr]; fs=subt1; } next=0; continue; } } else { next=accum[4][next]; continue; } } break; } /* * End of Aloop */ vn2=zzb[vn2]; } } free_and_exit: CKFREE(displ); CKFREE(slopes); CKFREE(diag_index); return; } /* end of pair_align */ /** * * Will compute ktuple scores and store in tmat * Following values will be set: tmat[i][j], where * istart <= i nseqs+1) * sizeof(int)); for (i=0; inseqs; i++) { seqlen_array[i+1] = mseq->sqinfo[i].len; } /* setup alignment parameters */ if (SEQTYPE_PROTEIN == mseq->seqtype) { DNAFLAG = FALSE; max_res_code = strlen(AMINO_ACID_CODES)-2; aln_param = default_protein_param; } else if (SEQTYPE_RNA == mseq->seqtype || SEQTYPE_DNA == mseq->seqtype) { DNAFLAG = TRUE; max_res_code = strlen(NUCLEIC_ACID_CODES)-2; aln_param = default_dna_param; } else { Log(&rLog, LOG_FATAL, "Internal error in %s: Unknown sequence type.", __FUNCTION__); } if (NULL!=param_override) { aln_param.ktup = param_override->ktup; aln_param.wind_gap = param_override->wind_gap; aln_param.signif = param_override->signif; aln_param.window = param_override->window; } /*LOG_DEBUG("DNAFLAG = %d max_res_code = %d", DNAFLAG, max_res_code);*/ /* convert mseq to clustal's old-style int encoded sequences (unit-offset) */ max_aln_length = 0; max_seq_len = 0; seq_array = (char **) CKMALLOC((mseq->nseqs+1) * sizeof(char *)); seq_array[0] = NULL; /* FIXME check that non of the seqs is smaller than ktup (?). * Otherwise segfault occurs */ for (i=0; inseqs; i++) { seq_array[i+1] = (char *) CKMALLOC((seqlen_array[i+1]+2) * sizeof (char));; } for (i=0; inseqs; i++) { /*LOG_DEBUG("calling encode with seq_array[%d+1] len=%d and seq=%s", i, seqlen_array[i+1], mseq->seq[i]);*/ if (TRUE == DNAFLAG) { encode(&(mseq->seq[i][-1]), seq_array[i+1], seqlen_array[i+1], NUCLEIC_ACID_CODES); } else { encode(&(mseq->seq[i][-1]), seq_array[i+1], seqlen_array[i+1], AMINO_ACID_CODES); } if (seqlen_array[i+1]>max_seq_len) { max_seq_len = seqlen_array[i+1]; } } max_aln_length = max_seq_len * 2; /* see sequence.c in old source */ /* FIXME: short sequences can cause seg-fault * because max_aln_length can get shorter * than (max_res_code+1)^k * FS, r222->r223 */ max_aln_length = max_aln_length > pow((max_res_code+1), aln_param.ktup)+1 ? max_aln_length : pow((max_res_code+1), aln_param.ktup)+1; /* * * conversion to old style clustal done (in no time) */ accum = (int **) CKCALLOC(5, sizeof (int *)); for (i=0;i<5;i++) { accum[i] = (int *) CKCALLOC((2*max_aln_length+1), sizeof(int)); } zza = (int *) CKCALLOC( (max_aln_length+1), sizeof(int)); zzb = (int *) CKCALLOC( (max_aln_length+1), sizeof(int)); zzc = (int *) CKCALLOC( (max_aln_length+1), sizeof(int)); zzd = (int *) CKCALLOC( (max_aln_length+1), sizeof(int)); /* estimation of total number of steps (if istart and jstart are * both 0) (now handled in the calling routine) */ /* uTotalStepNo = iend*jend - iend*iend/2 + iend/2; uStepNo = 0; */ /*LOG_DEBUG("istart=%d iend=%d jstart=%d jend=%d", istart, iend, jstart, jend);*/ for (i=istart+1; i<=iend; ++i) { /* by definition a sequence compared to itself should give a score of 0. AW */ SymMatrixSetValue(tmat, i-1, i-1, 0.0); make_ptrs(zza, zzc, i, seqlen_array[i], aln_param.ktup, max_res_code, seq_array); #ifdef HAVE_OPENMP #pragma omp critical(ktuple) #endif { ProgressLog(prProgress, *ulStepNo, ulTotalStepNo, FALSE); } for (j=MAX(i+1, jstart+1); j<=jend; ++j) { (*ulStepNo)++; private_step_no++; /*LOG_DEBUG("comparing pair %d:%d", i, j);*/ make_ptrs(zzb, zzd, j, seqlen_array[j], aln_param.ktup, max_res_code, seq_array); pair_align(i, seqlen_array[i], seqlen_array[j], max_res_code, &aln_param, seq_array, &maxsf, accum, max_aln_length, zza, zzb, zzc, zzd); if (!maxsf) { calc_score=0.0; } else { calc_score=(double)accum[0][maxsf]; if (percent) { dsr=(seqlen_array[i]nseqs; i++) { CKFREE(seq_array[i]); } CKFREE(seq_array); } /* end of KTuplePairDist */