binaries/src/clustalo/src/clustal/ktuple_pair.c

   1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2
   3 /*********************************************************************
   4  * Clustal Omega - Multiple sequence alignment
   5  *
   6  * Copyright (C) 2010 University College Dublin
   7  *
   8  * Clustal-Omega is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This file is part of Clustal-Omega.
  14  *
  15  ********************************************************************/
  16
  17 /*
  18  *  RCS $Id: ktuple_pair.c 230 2011-04-09 15:37:50Z andreas $
  19  *
  20  *
  21  * K-Tuple code for pairwise alignment (Wilbur and Lipman, 1983; PMID
  22  * 6572363). Most code taken from showpair.c (Clustal 1.83)
  23  * DD: some functions now have lots of parameters as static variables
  24  * were removed to make code OpenMP-friendly
  25  *
  26  */
  27
  28 #ifdef HAVE_CONFIG_H
  29 #include "config.h"
  30 #endif
  31
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include <ctype.h>
  36 #include <stdlib.h>
  37 #include <math.h>
  38 #include <assert.h>
  39
  40 #ifdef HAVE_OPENMP
  41 #include <omp.h>
  42 #endif
  43
  44 #include "squid/squid.h"
  45 #include "util.h"
  46 #include "symmatrix.h"
  47 #include "ktuple_pair.h"
  48 #include "log.h"
  49 #include "progress.h"
  50
  51 #define END_MARK -3 /* see interface.c in 1.83 */
  52 #define NUMRES 32 /* max size of comparison matrix */
  53
  54 /* see notes below */
  55 #undef SORT_LAST_ELEMENT_AS_WELL
  56
  57 /* gap_pos1 = NUMRES-2; /@ code for gaps inserted by clustalw @/ */
  58 static const int GAP_POS2 = NUMRES-1; /* code for gaps already in alignment */
  59 static bool DNAFLAG = FALSE;
  60
  61 static const char *AMINO_ACID_CODES = "ABCDEFGHIKLMNPQRSTUVWXYZ-";
  62 static const char *NUCLEIC_ACID_CODES = "ACGTUN-";
  63 /* As far as I understand the gap symbol should not be necessary here,
  64  * because we use isgap for testing later anyway. But changing this,
  65  * will affect max_res_code and max_nuc as well. So I leave it for now
  66  * as it is. AW
  67  */
  68
  69 static bool percent = TRUE;
  70
  71 static void make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array);
  72 static void put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum);
  73 static bool frag_rel_pos(int a1, int b1, int a2, int b2, int ktup);
  74 static void des_quick_sort(int *array1, int *array2, const int array_size);
  75 static void pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param,
  76     char **seq_array, int *maxsf, int **accum, int max_aln_length,
  77     int *zza, int *zzb, int *zzc, int *zzd);
  78 static void encode(char *seq, char *naseq, int l, const char *res_codes);
  79 static int res_index(const char *lookup, char c);
  80
  81
  82 typedef struct {
  83     int i1;
  84     int i2;
  85 } two_ints_t;
  86
  87
  88
  89 /* default ktuple pairwise alignment parameters
  90  *
  91  */
  92 /* protein
  93  */
  94 /* designated initializer */
  95 const ktuple_param_t default_protein_param = {
  96      .ktup = 1,
  97      .wind_gap = 3,
  98      .signif = 5,
  99      .window = 5,
 100 };
 101 /* dna
 102  */
 103 /* designated initializer */
 104 const ktuple_param_t default_dna_param = {
 105     .ktup = 2,
 106     .wind_gap = 5,
 107     .signif = 4,
 108     .window = 4,
 109 };
 110
 111
 112 /**
 113  * note: naseq should be unit-offset
 114  */
 115 static void
 116 encode(char *seq, char *naseq, int l, const char *res_codes)
 117 {
 118     /* code seq as ints .. use GAP_POS2 for gap */
 119     register int i;
 120     bool seq_contains_unknown_char = FALSE;
 121     /*LOG_DEBUG("seq=%s naseq=%p l=%d", &(seq[1]), naseq, l); */
 122
 123
 124     for (i=1; i<=l; i++) {
 125         char res = toupper(seq[i]);
 126         if (isgap(res)) {
 127             naseq[i] = GAP_POS2; /* gap in input */
 128         } else {
 129             naseq[i] = res_index(res_codes, res);
 130         }
 131
 132         /*LOG_DEBUG("Character '%c' at pos %d", res, i);*/
 133         if (-1 == naseq[i]) {
 134             seq_contains_unknown_char = TRUE;
 135             /*LOG_DEBUG("Unknown character '%c' at pos %d", res, i);*/
 136         }
 137         /*LOG_DEBUG("na_seq[%d]=%d", i, naseq[i]);*/
 138     }
 139
 140     if (TRUE == seq_contains_unknown_char)
 141         Log(&rLog, LOG_WARN, "Unknown character in seq '%s'", &(seq[1]));
 142
 143     naseq[i] = END_MARK;
 144
 145     return;
 146 }
 147 /* end of encode */
 148
 149
 150 /**
 151  *
 152  */
 153 static int
 154 res_index(const char *t, char c)
 155 {
 156     register int i;
 157     for (i=0; t[i] && t[i] != c; i++)
 158         ;
 159     if (t[i]) {
 160         return (i);
 161     } else {
 162         return -1;
 163     }
 164 }
 165 /* end of res_index */
 166
 167
 168 /**
 169  *
 170  */
 171 static void
 172 make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array)
 173 {
 174     /* FIXME make 10 a constant and give it a nice name */
 175     static int a[10];
 176     int i, j, code, flag;
 177     char residue;
 178     const int limit = (int) pow((double)(max_res_code+1),(double)ktup);
 179
 180     for (i=1;i<=ktup;i++)
 181         a[i] = (int) pow((double)(max_res_code+1),(double)(i-1));
 182
 183     for (i=1; i<=limit; ++i)
 184         pl[i]=0;
 185     for (i=1; i<=l; ++i)
 186         tptr[i]=0;
 187
 188     for (i=1; i<=(l-ktup+1); ++i) {
 189         code=0;
 190         flag=FALSE;
 191         for (j=1; j<=ktup; ++j) {
 192             /* Log(&rLog, LOG_FORCED_DEBUG, "naseq=%d i=%d j=%d seq_array[naseq]=%p",
 193              * naseq, i, j, seq_array[naseq]);
 194              */
 195             residue = seq_array[naseq][i+j-1];
 196             /* Log(&rLog, LOG_FORCED_DEBUG, "residue = %d", residue); */
 197             if ((residue<0) || (residue > max_res_code)){
 198                 flag=TRUE;
 199                 break;
 200             }
 201             code += ((residue) * a[j]);
 202         }
 203         if (flag)
 204             continue;
 205         ++code;
 206         if (0 != pl[code])
 207             tptr[i] =pl[code];
 208         pl[code] = i;
 209     }
 210
 211     return;
 212 }
 213 /* end of make_ptrs */
 214
 215
 216 /**
 217  *
 218  * FIXME Why hardcoding of 5?
 219  */
 220 static void
 221 put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum)
 222 {
 223     int end;
 224     accum[0][curr_frag]=fs;
 225     accum[1][curr_frag]=v1;
 226     accum[2][curr_frag]=v2;
 227     accum[3][curr_frag]=flen;
 228
 229     if (!*maxsf) {
 230         *maxsf=1;
 231         accum[4][curr_frag]=0;
 232         return;
 233     }
 234
 235     if (fs >= accum[0][*maxsf]) {
 236         accum[4][curr_frag]=*maxsf;
 237         *maxsf=curr_frag;
 238         return;
 239     } else {
 240         *next=*maxsf;
 241         while (TRUE) {
 242             end=*next;
 243             *next=accum[4][*next];
 244             if (fs>=accum[0][*next])
 245                 break;
 246         }
 247         accum[4][curr_frag]=*next;
 248         accum[4][end]=curr_frag;
 249     }
 250
 251     return;
 252 }
 253 /* end of put_frag */
 254
 255
 256 /**
 257  *
 258  */
 259 static bool
 260 frag_rel_pos(int a1, int b1, int a2, int b2, int ktup)
 261 {
 262     if (a1-b1 == a2-b2) {
 263         if (a2<a1) {
 264             return TRUE;
 265         }
 266     } else {
 267         if (a2+ktup-1<a1 && b2+ktup-1<b1) {
 268             return TRUE;
 269         }
 270     }
 271     return FALSE;
 272 }
 273 /* end of frag_rel_pos */
 274
 275
 276
 277
 278 /**
 279  *
 280  * @note: This is together with des_quick_sort most time consuming
 281  * routine according to gprof on r110. Tried to replace it with qsort
 282  * and/or QSortAndTrackIndex(), which is always slower! So we keep the
 283  * original.
 284  *
 285  * Original doc: Quicksort routine, adapted from chapter 4, page 115
 286  * of software tools by Kernighan and Plauger, (1986). Sort the
 287  * elements of array1 and sort the elements of array2 accordingly
 288  *
 289  * There might be a bug here. The original function apparently never
 290  * touches the last element and keeps it as is. Tried to fix this (see
 291  * SORT_LAST_ELEMENT_AS_WELL) which gives slightly worse performance
 292  * (-0.5% on BB). My fix might not be working or it's not a bug at
 293  * all...
 294  *
 295  *
 296  *
 297  */
 298 static void
 299 des_quick_sort(int *array1, int *array2, const int array_size)
 300 {
 301     int temp1, temp2;
 302     int p, pivlin;
 303     int i, j;
 304     int lst[50], ust[50];       /* the maximum no. of elements must be*/
 305                                 /* < log(base2) of 50 */
 306
 307 #if 0
 308     for (i=1; i<=array_size; i++) {
 309         Log(&rLog, LOG_FORCED_DEBUG, "b4 sort array1[%d]=%d array2[%d]=%d", i, array1[i], i, array2[i]);
 310     }
 311 #endif
 312     lst[1] = 1;
 313
 314 #ifdef SORT_LAST_ELEMENT_AS_WELL
 315     ust[1] = array_size;
 316 #else
 317     /* original */
 318     ust[1] = array_size-1;
 319 #endif
 320     p = 1;
 321
 322
 323     while (p > 0) {
 324         if (lst[p] >= ust[p]) {
 325             p--;
 326         } else {
 327             i = lst[p] - 1;
 328             j = ust[p];
 329             pivlin = array1[j];
 330             while (i < j) {
 331                 for (i=i+1; array1[i] < pivlin; i++)
 332                     ;
 333                 for (j=j-1; j > i; j--)
 334                     if (array1[j] <= pivlin) break;
 335                 if (i < j) {
 336                     temp1     = array1[i];
 337                     array1[i] = array1[j];
 338                     array1[j] = temp1;
 339
 340                     temp2     = array2[i];
 341                     array2[i] = array2[j];
 342                     array2[j] = temp2;
 343                 }
 344             }
 345
 346             j = ust[p];
 347
 348             temp1     = array1[i];
 349             array1[i] = array1[j];
 350             array1[j] = temp1;
 351
 352             temp2     = array2[i];
 353             array2[i] = array2[j];
 354             array2[j] = temp2;
 355
 356             if (i-lst[p] < ust[p] - i) {
 357                 lst[p+1] = lst[p];
 358                 ust[p+1] = i - 1;
 359                 lst[p]   = i + 1;
 360
 361             } else {
 362                 lst[p+1] = i + 1;
 363                 ust[p+1] = ust[p];
 364                 ust[p]   = i - 1;
 365             }
 366             p = p + 1;
 367         }
 368     }
 369
 370 #if 0
 371     for (i=1; i<=array_size; i++) {
 372         Log(&rLog, LOG_FORCED_DEBUG, "after sort array1[%d]=%d array2[%d]=%d", i, array1[i], i, array2[i]);
 373     }
 374 #endif
 375
 376     return;
 377 }
 378 /* end of des_quick_sort */
 379
 380
 381
 382 /**
 383  *
 384  * FIXME together with des_quick_sort most time consuming routine
 385  * according to gprof on r110
 386  *
 387  */
 388 static void
 389 pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param,
 390     char **seq_array, int *maxsf, int **accum, int max_aln_length,
 391     int *zza, int *zzb, int *zzc, int *zzd)
 392 {
 393     int next; /* forrmerly static */
 394     int pot[8],i, j, l, m, flag, limit, pos, vn1, vn2, flen, osptr, fs;
 395     int tv1, tv2, encrypt, subt1, subt2, rmndr;
 396     char residue;
 397     int *diag_index;
 398     int *displ;
 399     char *slopes;
 400     int curr_frag;
 401     const int tl1 = (l1+l2)-1;
 402
 403     assert(NULL!=aln_param);
 404
 405     /*
 406       Log(&rLog, LOG_FORCED_DEBUG, "DNAFLAG=%d seq_no=%d l1=%d l2=%d window=%d ktup=%d signif=%d wind_gap=%d",
 407       DNAFLAG, seq_no, l1, l2, window, ktup, signif,
 408       wind_gap);
 409     */
 410
 411     slopes = (char *) CKCALLOC(tl1+1, sizeof(char));
 412     displ = (int *) CKCALLOC(tl1+1, sizeof(int));
 413     diag_index = (int *) CKMALLOC((tl1+1) * sizeof(int));
 414
 415     for (i=1; i<=tl1; ++i) {
 416         /* unnecessary, because we calloced: slopes[i] = displ[i] = 0; */
 417         diag_index[i] = i;
 418     }
 419
 420     for (i=1;i<=aln_param->ktup;i++)
 421         pot[i] = (int) pow((double)(max_res_code+1),(double)(i-1));
 422     limit = (int) pow((double)(max_res_code+1),(double)aln_param->ktup);
 423
 424
 425
 426     /* increment diagonal score for each k_tuple match */
 427
 428     for (i=1; i<=limit; ++i) {
 429         vn1=zzc[i];
 430         while (TRUE) {
 431             if (!vn1) break;
 432             vn2 = zzd[i];
 433             while (0 != vn2) {
 434                 osptr = vn1-vn2+l2;
 435                 ++displ[osptr];
 436                 vn2 = zzb[vn2];
 437             }
 438             vn1=zza[vn1];
 439         }
 440     }
 441
 442
 443     /* choose the top SIGNIF diagonals
 444      */
 445
 446 #ifdef QSORT_REPLACEMENT
 447     /* This was an attempt to replace des_quick_sort with qsort(),
 448      * which turns out to be much slower, so don't use this
 449      */
 450
 451     /* FIXME: if we use this branch, we don't need to init diag_index
 452      * before, because that is done in QSortAndTrackIndex()
 453      * automatically.
 454      */
 455 #if 0
 456     for (i=1; i<=tl1; i++) {
 457         Log(&rLog, LOG_FORCED_DEBUG, "b4 sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]);
 458     }
 459 #endif
 460
 461     QSortAndTrackIndex(&(diag_index[1]), &(displ[1]), tl1, 'a', TRUE);
 462
 463 #if 0
 464     for (i=1; i<=tl1; i++) {
 465         Log(&rLog, LOG_FORCED_DEBUG, "after sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]);
 466     }
 467 #endif
 468
 469 #else
 470
 471     des_quick_sort(displ, diag_index, tl1);
 472
 473 #endif
 474
 475     j = tl1 - aln_param->signif + 1;
 476
 477     if (j < 1) {
 478         j = 1;
 479     }
 480
 481     /* flag all diagonals within WINDOW of a top diagonal */
 482
 483     for (i=tl1; i>=j; i--)  {
 484         if (displ[i] > 0) {
 485             pos = diag_index[i];
 486             l = (1   > pos - aln_param->window) ?
 487                 1 :  pos - aln_param->window;
 488             m = (tl1 < pos + aln_param->window) ?
 489                 tl1 : pos + aln_param->window;
 490             for (; l <= m; l++)
 491                 slopes[l] = 1;
 492         }
 493     }
 494
 495     for (i=1; i<=tl1; i++) {
 496         displ[i] = 0;
 497     }
 498
 499     curr_frag=*maxsf=0;
 500
 501     for (i=1; i<=(l1-aln_param->ktup+1); ++i) {
 502         encrypt=flag=0;
 503         for (j=1; j<=aln_param->ktup; ++j) {
 504             residue = seq_array[seq_no][i+j-1];
 505             if ((residue<0) || (residue>max_res_code)) {
 506                 flag=TRUE;
 507                 break;
 508             }
 509             encrypt += ((residue)*pot[j]);
 510         }
 511         if (flag) {
 512             continue;
 513         }
 514         ++encrypt;
 515
 516         vn2=zzd[encrypt];
 517
 518         flag=FALSE;
 519         while (TRUE) {
 520             if (!vn2) {
 521                 flag=TRUE;
 522                 break;
 523             }
 524             osptr=i-vn2+l2;
 525             if (1 != slopes[osptr]) {
 526                 vn2=zzb[vn2];
 527                 continue;
 528             }
 529             flen=0;
 530             fs=aln_param->ktup;
 531             next=*maxsf;
 532
 533             /*
 534              * A-loop
 535              */
 536
 537             while (TRUE) {
 538                 if (!next) {
 539                     ++curr_frag;
 540                     if (curr_frag >= 2*max_aln_length) {
 541                         Log(&rLog, LOG_VERBOSE, "(Partial alignment)");
 542                         goto free_and_exit; /* Yesss! Always wanted to
 543                                              * use a goto (AW) */
 544                     }
 545                     displ[osptr]=curr_frag;
 546                     put_frag(fs, i, vn2, flen, curr_frag, &next, maxsf, accum);
 547
 548                 } else {
 549                     tv1=accum[1][next];
 550                     tv2=accum[2][next];
 551
 552                     if (frag_rel_pos(i, vn2, tv1, tv2, aln_param->ktup)) {
 553                         if (i-vn2 == accum[1][next]-accum[2][next]) {
 554                             if (i > accum[1][next]+(aln_param->ktup-1)) {
 555                                 fs = accum[0][next]+aln_param->ktup;
 556                             } else {
 557                                 rmndr = i-accum[1][next];
 558                                 fs = accum[0][next]+rmndr;
 559                             }
 560                             flen=next;
 561                             next=0;
 562                             continue;
 563
 564                         } else {
 565                             if (0 == displ[osptr]) {
 566                                 subt1=aln_param->ktup;
 567                             } else {
 568                                 if (i > accum[1][displ[osptr]]+(aln_param->ktup-1)) {
 569                                     subt1=accum[0][displ[osptr]]+aln_param->ktup;
 570                                 } else {
 571                                     rmndr=i-accum[1][displ[osptr]];
 572                                     subt1=accum[0][displ[osptr]]+rmndr;
 573                                 }
 574                             }
 575                             subt2=accum[0][next] - aln_param->wind_gap + aln_param->ktup;
 576                             if (subt2>subt1) {
 577                                 flen=next;
 578                                 fs=subt2;
 579                             } else {
 580                                 flen=displ[osptr];
 581                                 fs=subt1;
 582                             }
 583                             next=0;
 584                             continue;
 585                         }
 586                     } else {
 587                         next=accum[4][next];
 588                         continue;
 589                     }
 590                 }
 591                 break;
 592             }
 593             /*
 594              * End of Aloop
 595              */
 596
 597             vn2=zzb[vn2];
 598         }
 599     }
 600
 601 free_and_exit:
 602     CKFREE(displ);
 603     CKFREE(slopes);
 604     CKFREE(diag_index);
 605
 606     return;
 607 }
 608 /* end of pair_align */
 609
 610
 611
 612 /**
 613  *
 614  * Will compute ktuple scores and store in tmat
 615  * Following values will be set: tmat[i][j], where
 616  * istart <= i <iend
 617  * and
 618  * jstart <= j < jend
 619  * i.e. zero-offset
 620  * tmat data members have to be preallocated
 621  *
 622  * if ktuple_param_t *aln_param == NULL defaults will be used
 623  */
 624 void
 625 KTuplePairDist(symmatrix_t *tmat, mseq_t *mseq,
 626                int istart, int iend,
 627                int jstart, int jend,
 628                ktuple_param_t *param_override,
 629                progress_t *prProgress,
 630                unsigned long int *ulStepNo, unsigned long int ulTotalStepNo)
 631 {
 632     /* this first group of variables were previously static
 633        and hence un-parallelisable */
 634     char **seq_array;
 635     int maxsf;
 636     int **accum;
 637     int max_aln_length;
 638     /* divide score with length of smallest sequence */
 639     int *zza, *zzb, *zzc, *zzd;
 640     int private_step_no = 0;
 641
 642     int i, j, dsr;
 643     double calc_score;
 644     int max_res_code = -1;
 645
 646     int max_seq_len;
 647     int *seqlen_array;
 648     /* progress_t *prProgress; */
 649     /* int uStepNo, uTotalStepNo; */
 650     ktuple_param_t aln_param = default_protein_param;
 651     bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
 652
 653
 654     if(prProgress == NULL) {
 655         NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
 656                     "Ktuple-distance calculation progress", bPrintCR);
 657     }
 658
 659     /* conversion to old style data types follows
 660      *
 661      */
 662
 663     seqlen_array = (int*) CKMALLOC((mseq->nseqs+1) * sizeof(int));
 664     for (i=0; i<mseq->nseqs; i++) {
 665         seqlen_array[i+1] = mseq->sqinfo[i].len;
 666     }
 667
 668     /* setup alignment parameters
 669      */
 670     if (SEQTYPE_PROTEIN == mseq->seqtype) {
 671         DNAFLAG = FALSE;
 672         max_res_code = strlen(AMINO_ACID_CODES)-2;
 673         aln_param = default_protein_param;
 674
 675     } else if (SEQTYPE_RNA == mseq->seqtype || SEQTYPE_DNA == mseq->seqtype) {
 676         DNAFLAG = TRUE;
 677         max_res_code = strlen(NUCLEIC_ACID_CODES)-2;
 678         aln_param = default_dna_param;
 679
 680     } else {
 681         Log(&rLog, LOG_FATAL, "Internal error in %s: Unknown sequence type.", __FUNCTION__);
 682     }
 683
 684     if (NULL!=param_override) {
 685         aln_param.ktup = param_override->ktup;
 686         aln_param.wind_gap = param_override->wind_gap;
 687         aln_param.signif = param_override->signif;
 688         aln_param.window = param_override->window;
 689     }
 690
 691     /*LOG_DEBUG("DNAFLAG = %d max_res_code = %d", DNAFLAG, max_res_code);*/
 692
 693     /* convert mseq to clustal's old-style int encoded sequences (unit-offset)
 694      */
 695     max_aln_length = 0;
 696     max_seq_len = 0;
 697     seq_array =  (char **) CKMALLOC((mseq->nseqs+1) * sizeof(char *));
 698     seq_array[0] = NULL;
 699     /* FIXME check that non of the seqs is smaller than ktup (?).
 700      * Otherwise segfault occurs
 701      */
 702     for (i=0; i<mseq->nseqs; i++) {
 703         seq_array[i+1] = (char *) CKMALLOC((seqlen_array[i+1]+2) * sizeof (char));;
 704     }
 705     for (i=0; i<mseq->nseqs; i++) {
 706         /*LOG_DEBUG("calling encode with seq_array[%d+1] len=%d and seq=%s",
 707           i, seqlen_array[i+1], mseq->seq[i]);*/
 708         if (TRUE == DNAFLAG) {
 709             encode(&(mseq->seq[i][-1]), seq_array[i+1],
 710                    seqlen_array[i+1], NUCLEIC_ACID_CODES);
 711         } else  {
 712             encode(&(mseq->seq[i][-1]), seq_array[i+1],
 713                    seqlen_array[i+1], AMINO_ACID_CODES);
 714         }
 715
 716         if (seqlen_array[i+1]>max_seq_len) {
 717             max_seq_len = seqlen_array[i+1];
 718         }
 719     }
 720     max_aln_length = max_seq_len * 2;
 721     /* see sequence.c in old source */
 722
 723     /* FIXME: short sequences can cause seg-fault
 724      * because max_aln_length can get shorter
 725      * than (max_res_code+1)^k
 726      * FS, r222->r223 */
 727     max_aln_length = max_aln_length > pow((max_res_code+1), aln_param.ktup)+1 ?
 728         max_aln_length : pow((max_res_code+1), aln_param.ktup)+1;
 729
 730     /*
 731      *
 732      * conversion to old style clustal done (in no time) */
 733
 734
 735     accum = (int **) CKCALLOC(5, sizeof (int *));
 736     for (i=0;i<5;i++) {
 737         accum[i] = (int *) CKCALLOC((2*max_aln_length+1), sizeof(int));
 738     }
 739     zza = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
 740     zzb = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
 741     zzc = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
 742     zzd = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
 743
 744     /* estimation of total number of steps (if istart and jstart are
 745      * both 0) (now handled in the calling routine)
 746      */
 747     /* uTotalStepNo = iend*jend - iend*iend/2 + iend/2;
 748     uStepNo = 0; */
 749     /*LOG_DEBUG("istart=%d iend=%d jstart=%d jend=%d", istart, iend, jstart, jend);*/
 750
 751     for (i=istart+1; i<=iend; ++i) {
 752         /* by definition a sequence compared to itself should give
 753            a score of 0. AW */
 754         SymMatrixSetValue(tmat, i-1, i-1, 0.0);
 755         make_ptrs(zza, zzc, i, seqlen_array[i], aln_param.ktup, max_res_code, seq_array);
 756
 757 #ifdef HAVE_OPENMP
 758         #pragma omp critical(ktuple)
 759 #endif
 760         {
 761             ProgressLog(prProgress, *ulStepNo, ulTotalStepNo, FALSE);
 762         }
 763
 764         for (j=MAX(i+1, jstart+1); j<=jend; ++j) {
 765             (*ulStepNo)++;
 766             private_step_no++;
 767             /*LOG_DEBUG("comparing pair %d:%d", i, j);*/
 768
 769             make_ptrs(zzb, zzd, j, seqlen_array[j], aln_param.ktup, max_res_code, seq_array);
 770             pair_align(i, seqlen_array[i], seqlen_array[j], max_res_code, &aln_param,
 771                 seq_array, &maxsf, accum, max_aln_length, zza, zzb, zzc, zzd);
 772
 773             if (!maxsf) {
 774                 calc_score=0.0;
 775             } else {
 776                 calc_score=(double)accum[0][maxsf];
 777                 if (percent) {
 778                     dsr=(seqlen_array[i]<seqlen_array[j]) ?
 779                         seqlen_array[i] : seqlen_array[j];
 780                     calc_score = (calc_score/(double)dsr) * 100.0;
 781                 }
 782             }
 783
 784             /* printf("%d %d %d\n", i-1, j-1, (100.0 - calc_score)/100.0); */
 785             SymMatrixSetValue(tmat, i-1, j-1, (100.0 - calc_score)/100.0);
 786
 787             /* the function allows you not to compute the full matrix.
 788              * here we explicitely make the resulting matrix a
 789              * rectangle, i.e. we always set full rows. in other
 790              * words, if we don't complete the full matrix then we
 791              * don't have a full symmetry. so only use the defined
 792              * symmetric part. AW
 793              */
 794             /*LOG_DEBUG("setting %d : %d = %f", j, i, tmat[i][j]);*/
 795             /* not needed anymore since we use symmatrix_t
 796                if (j<=iend) {
 797                tmat[j][i] = tmat[i][j];
 798                }
 799             */
 800 #ifdef HAVE_OPENMP
 801             #pragma omp critical(ktuple)
 802 #endif
 803             {
 804                 Log(&rLog, LOG_DEBUG, "K-tuple distance for sequence pair %d:%d = %lg",
 805                     i, j, SymMatrixGetValue(tmat, i-1, j-1));
 806             }
 807         }
 808     }
 809     /*
 810       Log(&rLog, LOG_FORCED_DEBUG, "uTotalStepNo=%d for istart=%d iend=%d jstart=%d jend=%d", uStepNo, istart, iend, jstart, jend);
 811       Log(&rLog, LOG_FORCED_DEBUG, "Fabian = %d", iend*jend - iend*iend/2 + iend/2);
 812     */
 813
 814 /*    printf("\n\n%d\t%d\t%d\t%d\n\n", omp_get_thread_num(), uStepNo, istart, iend); */
 815
 816     for (i=0;i<5;i++) {
 817         CKFREE(accum[i]);
 818     }
 819     CKFREE(accum);
 820
 821 #ifdef HAVE_OPENMP
 822     #pragma omp critical(ktuple)
 823 #if 0
 824     {
 825     printf("steps: %d\n", private_step_no);
 826     }
 827 #endif
 828 #endif
 829
 830     CKFREE(zza);
 831     CKFREE(zzb);
 832     CKFREE(zzc);
 833     CKFREE(zzd);
 834
 835     free(seqlen_array);
 836
 837     for (i=1; i<=mseq->nseqs; i++) {
 838         CKFREE(seq_array[i]);
 839     }
 840     CKFREE(seq_array);
 841 }
 842 /* end of KTuplePairDist */