3 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
15 #include "../alignment/Alignment.h"
42 /* adopted from Sean Eddy'ssquid:aligneval.c */
43 /* Function: PairwiseIdentity()
45 * Purpose: Calculate the pairwise fractional identity between
46 * two aligned sequences s1 and s2. This is simply
47 * (idents / MIN(len1, len2)).
49 * Note how many ways there are to calculate pairwise identity,
50 * because of the variety of choices for the denominator:
51 * idents/(idents+mismat) has the disadvantage that artifactual
52 * gappy alignments would have high "identities".
53 * idents/(AVG|MAX)(len1,len2) both have the disadvantage that
54 * alignments of fragments to longer sequences would have
55 * artifactually low "identities".
57 * Case sensitive; also, watch out in nucleic acid alignments;
58 * U/T RNA/DNA alignments will be counted as mismatches!
61 PairwiseIdentity(char @s1, char @s2)
63 int idents; /@ total identical positions @/
64 int len1, len2; /@ lengths of seqs @/
65 int x; /@ position in aligned seqs @/
67 idents = len1 = len2 = 0;
68 for (x = 0; s1[x] != '\0' && s2[x] != '\0'; x++)
72 if (s1[x] == s2[x]) idents++;
74 if (!isgap(s2[x])) len2++;
76 if (len2 < len1) len1 = len2;
77 return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
82 /* s1/s2 are unit-offset as usual
86 Stats::pairwiseIdentity(Alignment *alnObj, int s1, int s2)
88 int idents; /* total identical positions */
89 int len1, len2; /* real lengths of seqs */
90 int x; /* position in aligned seqs */
92 const vector<int>* seq1 = alnObj->getSequence(s1);
93 const vector<int>* seq2 = alnObj->getSequence(s2);
95 idents = len1 = len2 = 0;
96 // cerr << "comparing " << alnObj->getName(s1).c_str() << ":" << alnObj->getName(s2).c_str() << " " << s1 << ":" << s2 << "\n";
98 // sequence length should be identical, but be paranoid
99 for (x = 1; x<=alnObj->getSeqLength(s1) && x<=alnObj->getSeqLength(s2); x++)
101 if (! alnObj->isGap(s1, x)) {
103 //cerr << " pos " << x << ": " << (*seq1)[x] << ":" << (*seq2)[x] << "\n";
104 if ((*seq1)[x] == (*seq2)[x])
109 //cerr << " gap at pos " << x << " (" << s1 << ")\n";
111 if (! alnObj->isGap(s2, x))
115 // cerr << " gap at pos " << x << " (" << s2 << ")\n";
119 return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
127 Stats::ConcatInputHash(Alignment *alnObj)
129 vector<string> rawSeqArray;
133 // collect all sequences and sort
134 const clustalw::SeqArray* seqArray = alnObj->getSeqArray();
135 for(int s = 1; s <= alnObj->getNumSeqs(); s++)
138 for(int r = 1; r <= alnObj->getSeqLength(s); r++)
140 int val = (*seqArray)[s][r];
141 seq.append(1, clustalw::userParameters->getAminoAcidCode(val));
143 rawSeqArray.push_back(seq);
145 std::sort(rawSeqArray.begin(), rawSeqArray.end());
147 // concatenate sorted seqs
149 std::vector<string>::iterator iter;
150 for(iter=rawSeqArray.begin(); iter != rawSeqArray.end(); ++iter)
151 concatSeq.append(*iter);
153 // build hash and return
154 hash = Md5Hash(concatSeq.c_str());
157 ret="HASHING_FAILURE";
159 for (int i=0; i<strlen(hash); i++)
160 ret.append(1, hash[i]);
168 Stats::Md5ForSeq(Alignment *alnObj, int s)
171 const clustalw::SeqArray* seqArray = alnObj->getSeqArray();
175 for(int l = 1; l <= alnObj->getSeqLength(s); l++)
177 int val = (*seqArray)[s][l];
179 if((val < 0) || (val > userParameters->getMaxAA()))
181 seq.append(1, clustalw::userParameters->getAminoAcidCode(val));
183 std::transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
185 hash = Md5Hash(seq.c_str());
188 ret = "HASHING_FAILURE";
195 /* create md5 hash from input string
196 * returns NULL on failure
197 * user must free returned string
200 Stats::Md5Hash(const char *thread)
208 td = mhash_init(MHASH_MD5);
209 if (td == MHASH_FAILED)
215 mhash(td, thread, strlen(thread));
216 //mhash_deinit(td, hash);
217 hash = (unsigned char*) mhash_end(td);
219 rethash = (char*) calloc(mhash_get_block_size(MHASH_MD5)*2+1, sizeof(char));
220 for (i = 0; i < mhash_get_block_size(MHASH_MD5); i++) {
221 sprintf(&rethash[i*2], "%.2x", hash[i]);
234 Stats::logCmdLine(int argc, char **argv)
236 FILE *fp = fopen(logfilename.c_str(), "a");
239 cerr << "couldn't open file " << logfilename << " for logging of stats\n";
245 for (int i=1; i<argc; i++)
247 // remove non-interesting stuff
248 if (strstr(argv[i], "-infile=")==NULL &&
249 strstr(argv[i], "-outfile=")==NULL &&
250 strstr(argv[i], "-stats=")==NULL &&
251 strstr(argv[i], "-align")==NULL)
253 fprintf(fp, "cmdline non-default arg: %s\n", argv[i]);
260 /* log some statistics for input sequences, i.e. alnObj should hold
261 * the unaligned sequences
265 Stats::logInputSeqStats(Alignment *alnObj)
268 std::vector<double> lengths;
269 time_t t = time(NULL);
270 tm s = *localtime(&t);
274 FILE *fp = fopen(logfilename.c_str(), "a");
277 cerr << "couldn't open file " << logfilename << " for logging of stats\n";
281 fprintf(fp, "logging job: %s on %s", userParameters->getSeqName().c_str(), asctime(&s));
282 fprintf(fp, "clustal version: %s\n", userParameters->getRevisionLevel().c_str());
284 fprintf(fp, "seq type: ");
285 if (userParameters->getDNAFlag())
288 fprintf(fp, "protein");
291 fprintf(fp, "numseqs: %d\n", alnObj->getNumSeqs());
293 // create a vector of seq lengths for later
294 // and get shortest seq at the same time
295 shortest=alnObj->getLengthLongestSequence();
296 for (i = 1; i <= alnObj->getNumSeqs(); i++) {
297 int l = alnObj->getSeqLength(i);
298 lengths.push_back(l);
303 fprintf(fp, "seqlen longest: %d\n", alnObj->getLengthLongestSequence());
304 fprintf(fp, "seqlen shortest: %d\n", shortest);
306 fprintf(fp, "seqlen avg: %.2f\n", utilityObject->average(lengths));
307 fprintf(fp, "seqlen std-dev: %.2f\n", utilityObject->stdDev(lengths));
308 fprintf(fp, "seqlen median: %.2f\n", utilityObject->median(lengths));
312 //hash = concatInputHash(alnObj);
313 //fprintf(fp, "seq hash: %s\n", hash.c_str());
315 for (int s = 1; s <= alnObj->getNumSeqs(); s++) {
316 string md5 = Md5ForSeq(alnObj, s);
317 fprintf(fp, "md5 for seq %d: %s\n", s, md5.c_str());
321 fprintf(fp, "md5: disabled\n");
329 /* log some statistics for aligned sequences, i.e. alnObj should hold
330 * the already aligned sequences
334 Stats::logAlignedSeqStats(Alignment *alnObj)
336 FILE *fp = fopen(logfilename.c_str(), "a");
339 cerr << "couldn't open file " << logfilename << " for logging of stats\n";
343 // alignment length is the length of any sequence
344 fprintf(fp, "aln len: %d\n", alnObj->getSeqLength(1));
347 std::vector<double> pwIdents;
348 double lowestPwId = 1.0;
349 double hightestPwId = 0.0;
351 // create vector of pairwise identities
352 for(int s1 = 1; s1 <= alnObj->getNumSeqs(); s1++)
354 for(int s2 = s1+1; s2 <= alnObj->getNumSeqs(); s2++)
356 double thisPwId = pairwiseIdentity(alnObj, s1, s2);
357 pwIdents.push_back(thisPwId);
358 if (thisPwId>hightestPwId)
359 hightestPwId=thisPwId;
360 if (thisPwId<lowestPwId)
362 //fprintf(fp, "ident %s:%s %d:%d=%f\n", alnObj->getName(s1).c_str(), alnObj->getName(s2).c_str(), s1, s2, PairwiseIdentity(alnObj, s1, s2));
365 fprintf(fp, "aln pw-id highest: %.2f\n", hightestPwId);
366 fprintf(fp, "aln pw-id lowest: %.2f\n", lowestPwId);
367 fprintf(fp, "aln pw-id avg: %.2f\n", utilityObject->average(pwIdents));
368 fprintf(fp, "aln pw-id std-dev: %.2f\n", utilityObject->stdDev(pwIdents));
369 fprintf(fp, "aln pw-id median: %.2f\n", utilityObject->median(pwIdents));