1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: pair_dist.c 242 2011-05-27 14:04:21Z andreas $
30 /* only neededfor iNumberOfThreads */
31 #include "clustal-omega.h"
33 #include "ktuple_pair.h"
34 #include "pair_dist.h"
39 /* Up to rev 173 we had a USE_SYM_KTUPLE switch implemented here. When active
40 * ktuple distances were computed twice for each pair and averaged. Idea was
41 * to avoid assymmetries in the pairwise scores (score(a, b) is often not the
42 * same as score(b, a)). Results on BAliBASE indicate that this is overkill:
44 * r92_default core columns: avg-sp=0.800656 avg-tc=0.47711 (of total 218)
45 * r93-mod--norm-ktuple/ core columns: avg-sp=0.800656 avg-tc=0.47711 (of total 218)
46 * r93-mod--sym-ktuple/ core columns: avg-sp=0.801083 avg-tc=0.476544 (of total 217)
47 * r93-mod--rand-ktuple-1 core columns: avg-sp=0.799289 avg-tc=0.468028 (of total 218)
48 * r93-mod--rand-ktuple-2 core columns: avg-sp=0.801654 avg-tc=0.47659 (of total 217)
49 * r93-mod--rand-ktuple-3 core columns: avg-sp=0.800234 avg-tc=0.474908 (of total 218)
50 * r93-mod--rand-ktuple-4 core columns: avg-sp=0.800573 avg-tc=0.476514 (of total 218)
51 * r93-mod--rand-ktuple-5 core columns: avg-sp=0.799679 avg-tc=0.468716 (of total 218)
56 KimuraCorrection(double frac_id);
59 SquidIdPairDist(symmatrix_t *tmat, mseq_t *mseq,
62 bool use_KimuraCorrection, progress_t *prProgress,
63 unsigned long int *ulStepNo, unsigned long int ulTotalStepNo);
65 /* Taken from Muscle's msadistkimura.cpp */
66 static int DAYHOFF_PAMS[]={
67 195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */
68 196, /* 75.1% observed d; 196 PAMs estimated */
69 197, 198, 199, 200, 200, 201, 202, 203,
70 204, 205, 206, 207, 208, 209, 209, 210, 211, 212,
71 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
72 223, 224, 226, 227, 228, 229, 230, 231, 232, 233,
73 234, 236, 237, 238, 239, 240, 241, 243, 244, 245,
74 246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */
75 252, 253, 254, 255, 257, 258,
76 260, 261, 262, 264, 265, 267, 268, 270, 271, 273,
77 274, 276, 277, 279, 281, 282, 284, 285, 287, 289,
78 291, 292, 294, 296, 298, 299, 301, 303, 305, 307,
79 309, 311, 313, 315, 317, 319, 321, 323, 325, 328,
80 330, 332, 335, 337, 339, 342, 344, 347, 349, 352,
81 354, 357, 360, 362, 365, 368, 371, 374, 377, 380,
82 383, 386, 389, 393, 396, 399, 403, 407, 410, 414,
83 418, 422, 426, 430, 434, 438, 442, 447, 451, 456,
84 461, 466, 471, 476, 482, 487, 493, 498, 504, 511,
85 517, 524, 531, 538, 545, 553, 560, 569, 577, 586,
86 595, 605, 615, 626, 637, 649, 661, 675, 688, 703,
87 719, 736, 754, 775, 796, 819, 845, 874, 907, 945,
88 /* 92.9% observed; 945 PAMs */
89 988 /* 93.0% observed; 988 PAMs */
91 static int DAYHOFF_TABLE_ENTRIES = sizeof(DAYHOFF_PAMS)/sizeof(DAYHOFF_PAMS[0]);
97 * @brief Compute Kimura corrected distance.
99 * Original Muscle documentation following:
101 * This is defined to be:
102 * log_e(1 - p - p*p/5)
103 * where p is the fraction of residues that differ, i.e.:
104 * p = (1 - fractional_conservation)
105 * This measure is infinite for p = 0.8541 and is considered
106 * unreliable for p >= 0.75 (according to the ClustalW docs).
107 * ClustalW uses a table lookup for values > 0.75. The following table
108 * was copied from the ClustalW file dayhoff.h.
111 * @note copied from Muscle's msadistkimura.cpp:KimuraDist()
113 * @warning For protein only (uses Dayhoff substitution parameters)
116 * distance, e.g. 1.0 - fractional/relative identity
118 * @return The Kimura corrected distance
122 KimuraCorrection(double p)
126 /* Typical case: use Kimura's empirical formula */
128 return -log(1 - p - (p*p)/5);
130 /* Per ClustalW, return 10.0 for anything over 93% */
134 /* If 0.75 >= p <= 0.93, use table lookup */
135 table_index = (int) ((p - 0.75)*1000 + 0.5);
136 if (table_index < 0 || table_index >= DAYHOFF_TABLE_ENTRIES)
137 Log(&rLog, LOG_FATAL, "Internal error in %s:%s", __FILE__, __FUNCTION__);
139 return DAYHOFF_PAMS[table_index] / 100.0;
141 /*** end: KimuraCorrection() ***/
147 * @brief Compute distances between all aligned sequence pairs using
148 * squid's PairwiseIdentity, which is: idents / MIN(len1, len2)
151 * Where to store the computed distances
153 * The aligned sequences
155 * For distances [i][j] i>=istart, i<j
157 * For distances [i][j] i<iend, i<j
159 * For distances [i][j] j>=jstart, i<j
161 * For distances [i][j] i<j<jend, i<j
162 * @param[in] use_kimura
163 * Use Kimura corrected values (Proteins only)
165 * @return Non-zero on error
169 SquidIdPairDist(symmatrix_t *tmat, mseq_t *mseq,
170 int istart, int iend,
171 int jstart, int jend,
172 bool use_kimura, progress_t *prProgress,
173 unsigned long int *ulStepNo, unsigned long int ulTotalStepNo)
176 /* progress_t *prProgress; */
177 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
178 /* unsigned long int ulStepNo;
179 unsigned long ulTotalStepNo; */
181 assert(NULL != tmat);
182 assert(NULL != mseq);
184 if (TRUE != mseq->aligned) {
185 Log(&rLog, LOG_ERROR, "Sequences need to be aligned (%s)", __FUNCTION__);
188 if (SEQTYPE_PROTEIN != mseq->seqtype && TRUE == use_kimura) {
189 Log(&rLog, LOG_WARN, "Using Kimura distance corretion which includes Dayhoff substitution table lookup for non-protein sequences");
192 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
193 "Pairwise distance calculation progress", bPrintCR);
194 /* estimation of total number of steps (if istart and jstart are
197 /* ulTotalStepNo = iend*jend - iend*iend/2 + iend/2;
199 /*LOG_DEBUG("istart=%d iend=%d jstart=%d jend=%d", istart, iend, jstart, jend);*/
200 for (i=istart; i<iend; ++i) {
201 /* by definition a sequence compared to itself should give a
203 SymMatrixSetValue(tmat, i, i, 0.0);
205 #pragma omp critical(squidid)
208 ProgressLog(prProgress, *ulStepNo, ulTotalStepNo, FALSE);
211 for (j=MAX(i+1, jstart); j<jend; ++j) {
213 dist = 1.0 - PairwiseIdentity(mseq->seq[i], mseq->seq[j]);
220 /*LOG_DEBUG("%d:%d raw dist = %f", i, j, dist);*/
222 dist = KimuraCorrection(dist);
223 /*LOG_DEBUG("cor dist = %f", dist);*/
225 SymMatrixSetValue(tmat, i, j, dist);
227 #pragma omp critical(squidid)
230 Log(&rLog, LOG_DEBUG, "Aligned distance for sequence pair %d:%d= %lg",
238 /*** end: SquidIdPairDist() ***/
244 * @brief compute or read precomputed distances for given sequences
246 * @param[out] distmat
247 * Distances will be written to this matrix. will be allocated here as
248 * well. Caller must free with FreeSymMatrix()
250 * Distances will be computed for these sequences
251 * @param[in] pairdist_type
252 * Type of pairwise distance comparison
253 * @param[in] fdist_in
254 * If not NULL, sequences will be written from this file instead of
257 * Compute distances for sequences i:j, i>=istart, i<j.
260 * Compute distances for sequences i:j, i<iend, i<j
261 * Usually mseq->nseqs.
263 * Compute distances for sequences i:j, j>=jstart, i<j
266 * Compute distances for sequences i:j, j<iend, i<j
267 * Usually mseq->nseqs.
268 * @param[in] fdist_out
269 * If not NULL, distances will be written to this files
274 PairDistances(symmatrix_t **distmat, mseq_t *mseq, int pairdist_type,
275 int istart, int iend,
276 int jstart, int jend,
277 char *fdist_in, char *fdist_out)
280 unsigned long int ulStepNo = 0, ulTotalStepNo; /* DD: moved from SquidIdPairDist so progress bar works multithreaded */
281 int iChunk, iChunkStart, iChunkEnd;
282 int iChunkStarts[iNumberOfThreads];
283 int iChunkEnds[iNumberOfThreads];
284 progress_t *prProgress = NULL;
285 int iSquidSuccess = 0;
286 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
288 assert(NULL!=distmat);
293 if (NewSymMatrix(distmat, iend, jend)!=0) {
294 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for distance matrix failed");
298 /* compute pairwise distances or read from file
302 #include "random-dist.h"
304 if (NULL != fdist_in) {
305 Log(&rLog, LOG_FATAL, "FIXME: reading of distance matrix from file not implemented");
309 /* break into chunks, one for each thread
310 matrix is a triangle, not a square
311 hence making even chunk sizes is slightly fiddlier
313 ulTotalStepNo = iend*jend - iend*iend/2 + iend/2;
315 /* FIXME: can get rid of iChunkStart, iChunkEnd now that we're using the arrays */
317 for(iChunk = 0; iChunk <= iNumberOfThreads; iChunk++)
319 iChunkEnd = iChunkStart;
320 if(iChunk == iNumberOfThreads - 1)
323 iChunkStart = iend - ((double)(iend - istart) * sqrt(((double)iChunk + 1.0)/(double)iNumberOfThreads));
324 iChunkStarts[iChunk] = iChunkStart;
325 iChunkEnds[iChunk] = iChunkEnd;
328 if (PAIRDIST_KTUPLE == pairdist_type) {
330 Log(&rLog, LOG_INFO, "Calculating pairwise ktuple-distances...");
332 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
333 "Ktuple-distance calculation progress", bPrintCR);
335 #pragma omp parallel for private(iChunk) schedule(dynamic)
337 for(iChunk = 0; iChunk < iNumberOfThreads; iChunk++)
339 KTuplePairDist((*distmat), mseq, iChunkStarts[iChunk],
340 iChunkEnds[iChunk], jstart, jend, NULL, prProgress,
341 &ulStepNo, ulTotalStepNo);
345 printf("total ops %d\n", ulStepNo);
348 KTuplePairDist((*distmat), mseq,
350 jstart, jend, NULL); */
352 } else if (PAIRDIST_SQUIDID == pairdist_type) {
353 Log(&rLog, LOG_INFO, "Calculating pairwise aligned identity distances...");
355 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
356 "Pairwise identity calculation progress", bPrintCR);
358 #pragma omp parallel for private(iChunk) schedule(dynamic)
360 for(iChunk = 0; iChunk < iNumberOfThreads; iChunk++)
362 iSquidSuccess = SquidIdPairDist((*distmat), mseq,
363 iChunkStarts[iChunk], iChunkEnds[iChunk],
364 jstart, jend, FALSE, prProgress,
365 &ulStepNo, ulTotalStepNo);
367 if(iSquidSuccess != 0)
370 } else if (PAIRDIST_SQUIDID_KIMURA == pairdist_type) {
371 Log(&rLog, LOG_INFO, "Calculating Kimura-corrected pairwise aligned identity distances...");
372 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
373 "Pairwise identity calculation progress", bPrintCR);
375 #pragma omp parallel for private(iChunk) schedule(dynamic)
377 for(iChunk = 0; iChunk < iNumberOfThreads; iChunk++)
379 iSquidSuccess = SquidIdPairDist((*distmat), mseq,
380 iChunkStarts[iChunk], iChunkEnds[iChunk],
381 jstart, jend, TRUE, prProgress,
382 &ulStepNo, ulTotalStepNo);
384 if(iSquidSuccess != 0)
387 Log(&rLog, LOG_FATAL, "INTERNAL ERROR: don't know about pairdist_type %d",
391 #endif /* random/proper distance calculation */
394 /* optional printing of matrix to file
396 if (NULL != fdist_out) {
397 /* need a copy of sequence names for printing */
399 names = (char **)CKMALLOC(mseq->nseqs * sizeof(char*));
400 for (uSeqIndex=0; uSeqIndex<mseq->nseqs; uSeqIndex++) {
401 names[uSeqIndex] = mseq->sqinfo[uSeqIndex].name;
404 SymMatrixPrint((*distmat), names, fdist_out);
406 Log(&rLog, LOG_INFO, "Pairwise distance matrix written to %s",
412 #include "distance-distrib.h"
415 if (NULL != prProgress) {
416 ProgressDone(prProgress);
417 FreeProgress(&prProgress);
422 /*** end: PairDistances() ***/