1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: pair_dist.c 301 2016-06-13 13:32:55Z fabian $
30 /* only neededfor iNumberOfThreads */
31 #include "clustal-omega.h"
33 #include "ktuple_pair.h"
34 #include "pair_dist.h"
38 /* Made iend/jend const unsigned long int (originally just int), FS, 2016-04-04
42 /* Up to rev 173 we had a USE_SYM_KTUPLE switch implemented here. When active
43 * ktuple distances were computed twice for each pair and averaged. Idea was
44 * to avoid assymmetries in the pairwise scores (score(a, b) is often not the
45 * same as score(b, a)). Results on BAliBASE indicate that this is overkill:
47 * r92_default core columns: avg-sp=0.800656 avg-tc=0.47711 (of total 218)
48 * r93-mod--norm-ktuple/ core columns: avg-sp=0.800656 avg-tc=0.47711 (of total 218)
49 * r93-mod--sym-ktuple/ core columns: avg-sp=0.801083 avg-tc=0.476544 (of total 217)
50 * r93-mod--rand-ktuple-1 core columns: avg-sp=0.799289 avg-tc=0.468028 (of total 218)
51 * r93-mod--rand-ktuple-2 core columns: avg-sp=0.801654 avg-tc=0.47659 (of total 217)
52 * r93-mod--rand-ktuple-3 core columns: avg-sp=0.800234 avg-tc=0.474908 (of total 218)
53 * r93-mod--rand-ktuple-4 core columns: avg-sp=0.800573 avg-tc=0.476514 (of total 218)
54 * r93-mod--rand-ktuple-5 core columns: avg-sp=0.799679 avg-tc=0.468716 (of total 218)
59 KimuraCorrection(double frac_id);
62 SquidIdPairDist(symmatrix_t *tmat, mseq_t *mseq,
63 int istart, const unsigned long int iend,
64 int jstart, const unsigned long int jend,
65 bool use_KimuraCorrection, progress_t *prProgress,
66 unsigned long int *ulStepNo, unsigned long int ulTotalStepNo);
68 /* Taken from Muscle's msadistkimura.cpp */
69 static int DAYHOFF_PAMS[]={
70 195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */
71 196, /* 75.1% observed d; 196 PAMs estimated */
72 197, 198, 199, 200, 200, 201, 202, 203,
73 204, 205, 206, 207, 208, 209, 209, 210, 211, 212,
74 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
75 223, 224, 226, 227, 228, 229, 230, 231, 232, 233,
76 234, 236, 237, 238, 239, 240, 241, 243, 244, 245,
77 246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */
78 252, 253, 254, 255, 257, 258,
79 260, 261, 262, 264, 265, 267, 268, 270, 271, 273,
80 274, 276, 277, 279, 281, 282, 284, 285, 287, 289,
81 291, 292, 294, 296, 298, 299, 301, 303, 305, 307,
82 309, 311, 313, 315, 317, 319, 321, 323, 325, 328,
83 330, 332, 335, 337, 339, 342, 344, 347, 349, 352,
84 354, 357, 360, 362, 365, 368, 371, 374, 377, 380,
85 383, 386, 389, 393, 396, 399, 403, 407, 410, 414,
86 418, 422, 426, 430, 434, 438, 442, 447, 451, 456,
87 461, 466, 471, 476, 482, 487, 493, 498, 504, 511,
88 517, 524, 531, 538, 545, 553, 560, 569, 577, 586,
89 595, 605, 615, 626, 637, 649, 661, 675, 688, 703,
90 719, 736, 754, 775, 796, 819, 845, 874, 907, 945,
91 /* 92.9% observed; 945 PAMs */
92 988 /* 93.0% observed; 988 PAMs */
94 static int DAYHOFF_TABLE_ENTRIES = sizeof(DAYHOFF_PAMS)/sizeof(DAYHOFF_PAMS[0]);
100 * @brief Compute Kimura corrected distance.
102 * Original Muscle documentation following:
104 * This is defined to be:
105 * log_e(1 - p - p*p/5)
106 * where p is the fraction of residues that differ, i.e.:
107 * p = (1 - fractional_conservation)
108 * This measure is infinite for p = 0.8541 and is considered
109 * unreliable for p >= 0.75 (according to the ClustalW docs).
110 * ClustalW uses a table lookup for values > 0.75. The following table
111 * was copied from the ClustalW file dayhoff.h.
114 * @note copied from Muscle's msadistkimura.cpp:KimuraDist()
116 * @warning For protein only (uses Dayhoff substitution parameters)
119 * distance, e.g. 1.0 - fractional/relative identity
121 * @return The Kimura corrected distance
125 KimuraCorrection(double p)
129 /* Typical case: use Kimura's empirical formula */
131 return -log(1 - p - (p*p)/5);
133 /* Per ClustalW, return 10.0 for anything over 93% */
137 /* If 0.75 >= p <= 0.93, use table lookup */
138 table_index = (int) ((p - 0.75)*1000 + 0.5);
139 if (table_index < 0 || table_index >= DAYHOFF_TABLE_ENTRIES)
140 Log(&rLog, LOG_FATAL, "Internal error in %s:%s", __FILE__, __FUNCTION__);
142 return DAYHOFF_PAMS[table_index] / 100.0;
144 /*** end: KimuraCorrection() ***/
150 * @brief Compute distances between all aligned sequence pairs using
151 * squid's PairwiseIdentity, which is: idents / MIN(len1, len2)
154 * Where to store the computed distances
156 * The aligned sequences
158 * For distances [i][j] i>=istart, i<j
160 * For distances [i][j] i<iend, i<j
162 * For distances [i][j] j>=jstart, i<j
164 * For distances [i][j] i<j<jend, i<j
165 * @param[in] use_kimura
166 * Use Kimura corrected values (Proteins only)
168 * @return Non-zero on error
172 SquidIdPairDist(symmatrix_t *tmat, mseq_t *mseq,
173 int istart, const unsigned long int iend,
174 int jstart, const unsigned long int jend,
175 bool use_kimura, progress_t *prProgress,
176 unsigned long int *ulStepNo, unsigned long int ulTotalStepNo)
179 /* progress_t *prProgress; */
180 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
181 /* unsigned long int ulStepNo;
182 unsigned long ulTotalStepNo; */
184 assert(NULL != tmat);
185 assert(NULL != mseq);
187 if (TRUE != mseq->aligned) {
188 Log(&rLog, LOG_ERROR, "Sequences need to be aligned (%s)", __FUNCTION__);
191 if (SEQTYPE_PROTEIN != mseq->seqtype && TRUE == use_kimura) {
192 Log(&rLog, LOG_WARN, "Using Kimura distance corretion which includes Dayhoff substitution table lookup for non-protein sequences");
195 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
196 "Pairwise distance calculation progress", bPrintCR);
197 /* estimation of total number of steps (if istart and jstart are
200 /* ulTotalStepNo = iend*jend - iend*iend/2 + iend/2;
202 /*LOG_DEBUG("istart=%d iend=%d jstart=%d jend=%d", istart, iend, jstart, jend);*/
203 for (i=istart; i<iend; ++i) {
204 /* by definition a sequence compared to itself should give a
206 SymMatrixSetValue(tmat, i, i, 0.0);
208 #pragma omp critical(squidid)
211 ProgressLog(prProgress, *ulStepNo, ulTotalStepNo, FALSE);
214 for (j=MAX(i+1, jstart); j<jend; ++j) {
216 dist = 1.0 - PairwiseIdentity(mseq->seq[i], mseq->seq[j]);
223 /*LOG_DEBUG("%d:%d raw dist = %f", i, j, dist);*/
225 dist = KimuraCorrection(dist);
226 /*LOG_DEBUG("cor dist = %f", dist);*/
228 SymMatrixSetValue(tmat, i, j, dist);
230 #pragma omp critical(squidid)
233 Log(&rLog, LOG_DEBUG, "Aligned distance for sequence pair %d:%d= %lg",
241 /*** end: SquidIdPairDist() ***/
247 * @brief compute or read precomputed distances for given sequences
249 * @param[out] distmat
250 * Distances will be written to this matrix. will be allocated here as
251 * well. Caller must free with FreeSymMatrix()
253 * Distances will be computed for these sequences
254 * @param[in] pairdist_type
255 * Type of pairwise distance comparison
256 * @param[in] fdist_in
257 * If not NULL, sequences will be written from this file instead of
260 * Compute distances for sequences i:j, i>=istart, i<j.
263 * Compute distances for sequences i:j, i<iend, i<j
264 * Usually mseq->nseqs.
266 * Compute distances for sequences i:j, j>=jstart, i<j
269 * Compute distances for sequences i:j, j<iend, i<j
270 * Usually mseq->nseqs.
271 * @param[in] fdist_out
272 * If not NULL, distances will be written to this files
277 PairDistances(symmatrix_t **distmat, mseq_t *mseq, int pairdist_type, bool bPercID,
278 int istart, const unsigned long int iend,
279 int jstart, const unsigned long int jend,
280 char *fdist_in, char *fdist_out)
283 unsigned long int ulStepNo = 0, ulTotalStepNo; /* DD: moved from SquidIdPairDist so progress bar works multithreaded */
284 int iChunk, iChunkStart, iChunkEnd;
285 int iChunkStarts[iNumberOfThreads];
286 int iChunkEnds[iNumberOfThreads];
287 progress_t *prProgress = NULL;
288 int iSquidSuccess = 0;
289 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
291 assert(NULL!=distmat);
297 /* compute pairwise distances or read from file
301 #include "random-dist.h"
303 if (NULL != fdist_in) {
305 "Please use distance matrix input only, if you know exactly what you're doing!");
306 if (SymMatrixRead(fdist_in, distmat, mseq)) {
307 Log(&rLog, LOG_FATAL, "%s", "Reading distance matrix failed");
312 if (NewSymMatrix(distmat, iend, jend)!=0) {
313 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for distance matrix failed");
316 /* break into chunks, one for each thread
317 matrix is a triangle, not a square
318 hence making even chunk sizes is slightly fiddlier
320 ulTotalStepNo = iend*jend - iend*iend/2 + iend/2;
322 /* FIXME: can get rid of iChunkStart, iChunkEnd now that we're using the arrays */
324 for(iChunk = 0; iChunk <= iNumberOfThreads; iChunk++)
326 iChunkEnd = iChunkStart;
327 if (iChunk == iNumberOfThreads - 1){
330 else if (iend == jend){
331 iChunkStart = iend - ((double)(iend - istart) * sqrt(((double)iChunk + 1.0)/(double)iNumberOfThreads));
334 iChunkStart = iend - (iend - istart) * (iChunk + 1) / (double)(iNumberOfThreads);
336 iChunkStarts[iChunk] = iChunkStart;
337 iChunkEnds[iChunk] = iChunkEnd;
338 /*printf("%s:%d: C=%d, ie=%d, is=%d, je=%d, js=%d, Cstart=%d, Cend=%d, diff=%d\n",
339 __FILE__, __LINE__, iChunk, iend, istart, jend, jstart, iChunkStart, iChunkEnd, iChunkEnd-iChunkStart);*/
342 if (PAIRDIST_KTUPLE == pairdist_type) {
344 Log(&rLog, LOG_INFO, "Calculating pairwise ktuple-distances...");
346 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
347 "Ktuple-distance calculation progress", bPrintCR);
349 #pragma omp parallel for private(iChunk) schedule(dynamic)
351 for(iChunk = 0; iChunk < iNumberOfThreads; iChunk++)
353 KTuplePairDist((*distmat), mseq, iChunkStarts[iChunk],
354 iChunkEnds[iChunk], jstart, jend, NULL, prProgress,
355 &ulStepNo, ulTotalStepNo);
359 printf("total ops %d\n", ulStepNo);
362 KTuplePairDist((*distmat), mseq,
364 jstart, jend, NULL); */
366 } else if (PAIRDIST_SQUIDID == pairdist_type) {
367 Log(&rLog, LOG_INFO, "Calculating pairwise aligned identity distances...");
369 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
370 "Pairwise identity calculation progress", bPrintCR);
372 #pragma omp parallel for private(iChunk) schedule(dynamic)
374 for(iChunk = 0; iChunk < iNumberOfThreads; iChunk++)
376 iSquidSuccess = SquidIdPairDist((*distmat), mseq,
377 iChunkStarts[iChunk], iChunkEnds[iChunk],
378 jstart, jend, FALSE, prProgress,
379 &ulStepNo, ulTotalStepNo);
381 if(iSquidSuccess != 0)
384 } else if (PAIRDIST_SQUIDID_KIMURA == pairdist_type) {
385 Log(&rLog, LOG_INFO, "Calculating Kimura-corrected pairwise aligned identity distances...");
386 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
387 "Pairwise identity calculation progress", bPrintCR);
389 #pragma omp parallel for private(iChunk) schedule(dynamic)
391 for(iChunk = 0; iChunk < iNumberOfThreads; iChunk++)
393 iSquidSuccess = SquidIdPairDist((*distmat), mseq,
394 iChunkStarts[iChunk], iChunkEnds[iChunk],
395 jstart, jend, TRUE, prProgress,
396 &ulStepNo, ulTotalStepNo);
398 if(iSquidSuccess != 0)
401 Log(&rLog, LOG_FATAL, "INTERNAL ERROR: don't know about pairdist_type %d",
405 #endif /* random/proper distance calculation */
408 /* optional printing of matrix to file
410 if (NULL != fdist_out) {
411 /* need a copy of sequence names for printing */
413 names = (char **)CKMALLOC(mseq->nseqs * sizeof(char*));
414 for (uSeqIndex=0; uSeqIndex<mseq->nseqs; uSeqIndex++) {
415 names[uSeqIndex] = mseq->sqinfo[uSeqIndex].name;
418 SymMatrixPrint((*distmat), names, fdist_out, bPercID);
420 Log(&rLog, LOG_INFO, "Pairwise distance matrix written to %s",
426 #include "distance-distrib.h"
429 if (NULL != prProgress) {
430 ProgressDone(prProgress);
431 FreeProgress(&prProgress);
436 /*** end: PairDistances() ***/