/** * * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "../alignment/Alignment.h" #ifdef HAVE_MHASH_H #include "mhash.h" #endif #include "Stats.h" using namespace std; namespace clustalw { Stats::Stats() { enabled = false; } Stats::~Stats() { } /* adopted from Sean Eddy'ssquid:aligneval.c */ /* Function: PairwiseIdentity() * * Purpose: Calculate the pairwise fractional identity between * two aligned sequences s1 and s2. This is simply * (idents / MIN(len1, len2)). * * Note how many ways there are to calculate pairwise identity, * because of the variety of choices for the denominator: * idents/(idents+mismat) has the disadvantage that artifactual * gappy alignments would have high "identities". * idents/(AVG|MAX)(len1,len2) both have the disadvantage that * alignments of fragments to longer sequences would have * artifactually low "identities". * * Case sensitive; also, watch out in nucleic acid alignments; * U/T RNA/DNA alignments will be counted as mismatches! */ /* float PairwiseIdentity(char @s1, char @s2) { int idents; /@ total identical positions @/ int len1, len2; /@ lengths of seqs @/ int x; /@ position in aligned seqs @/ idents = len1 = len2 = 0; for (x = 0; s1[x] != '\0' && s2[x] != '\0'; x++) { if (!isgap(s1[x])) { len1++; if (s1[x] == s2[x]) idents++; } if (!isgap(s2[x])) len2++; } if (len2 < len1) len1 = len2; return (len1 == 0 ? 0.0 : (float) idents / (float) len1); } */ /* s1/s2 are unit-offset as usual * */ float Stats::pairwiseIdentity(Alignment *alnObj, int s1, int s2) { int idents; /* total identical positions */ int len1, len2; /* real lengths of seqs */ int x; /* position in aligned seqs */ const vector* seq1 = alnObj->getSequence(s1); const vector* seq2 = alnObj->getSequence(s2); idents = len1 = len2 = 0; // cerr << "comparing " << alnObj->getName(s1).c_str() << ":" << alnObj->getName(s2).c_str() << " " << s1 << ":" << s2 << "\n"; // sequence length should be identical, but be paranoid for (x = 1; x<=alnObj->getSeqLength(s1) && x<=alnObj->getSeqLength(s2); x++) { if (! alnObj->isGap(s1, x)) { len1++; //cerr << " pos " << x << ": " << (*seq1)[x] << ":" << (*seq2)[x] << "\n"; if ((*seq1)[x] == (*seq2)[x]) idents++; } //DEBUG //else { //cerr << " gap at pos " << x << " (" << s1 << ")\n"; //} if (! alnObj->isGap(s2, x)) len2++; //DEBUG //else // cerr << " gap at pos " << x << " (" << s2 << ")\n"; } if (len2 < len1) len1 = len2; return (len1 == 0 ? 0.0 : (float) idents / (float) len1); } #ifdef HAVE_MHASH_H string Stats::ConcatInputHash(Alignment *alnObj) { vector rawSeqArray; string ret; char *hash; // collect all sequences and sort const clustalw::SeqArray* seqArray = alnObj->getSeqArray(); for(int s = 1; s <= alnObj->getNumSeqs(); s++) { string seq; for(int r = 1; r <= alnObj->getSeqLength(s); r++) { int val = (*seqArray)[s][r]; seq.append(1, clustalw::userParameters->getAminoAcidCode(val)); } rawSeqArray.push_back(seq); } std::sort(rawSeqArray.begin(), rawSeqArray.end()); // concatenate sorted seqs string concatSeq; std::vector::iterator iter; for(iter=rawSeqArray.begin(); iter != rawSeqArray.end(); ++iter) concatSeq.append(*iter); // build hash and return hash = Md5Hash(concatSeq.c_str()); if (hash==NULL) { ret="HASHING_FAILURE"; } else { for (int i=0; igetSeqArray(); char *hash; string ret; for(int l = 1; l <= alnObj->getSeqLength(s); l++) { int val = (*seqArray)[s][l]; // continue if gap if((val < 0) || (val > userParameters->getMaxAA())) continue; seq.append(1, clustalw::userParameters->getAminoAcidCode(val)); } std::transform(seq.begin(), seq.end(), seq.begin(), ::toupper); hash = Md5Hash(seq.c_str()); if (hash==NULL) { ret = "HASHING_FAILURE"; } else { ret = hash; } return ret; } /* create md5 hash from input string * returns NULL on failure * user must free returned string */ char * Stats::Md5Hash(const char *thread) { MHASH td; char *tmpcstr; int i; unsigned char *hash; char *rethash; td = mhash_init(MHASH_MD5); if (td == MHASH_FAILED) return NULL; if (thread==NULL) return NULL; mhash(td, thread, strlen(thread)); //mhash_deinit(td, hash); hash = (unsigned char*) mhash_end(td); rethash = (char*) calloc(mhash_get_block_size(MHASH_MD5)*2+1, sizeof(char)); for (i = 0; i < mhash_get_block_size(MHASH_MD5); i++) { sprintf(&rethash[i*2], "%.2x", hash[i]); } return rethash; } #endif // HAVE_MHASH_H void Stats::logCmdLine(int argc, char **argv) { FILE *fp = fopen(logfilename.c_str(), "a"); if (fp == NULL) { cerr << "couldn't open file " << logfilename << " for logging of stats\n"; return; } if (argc > 1) { for (int i=1; i lengths; time_t t = time(NULL); tm s = *localtime(&t); int shortest; string hash; FILE *fp = fopen(logfilename.c_str(), "a"); if (fp == NULL) { cerr << "couldn't open file " << logfilename << " for logging of stats\n"; return; } fprintf(fp, "logging job: %s on %s", userParameters->getSeqName().c_str(), asctime(&s)); fprintf(fp, "clustal version: %s\n", userParameters->getRevisionLevel().c_str()); fprintf(fp, "seq type: "); if (userParameters->getDNAFlag()) fprintf(fp, "DNA"); else fprintf(fp, "protein"); fprintf(fp, "\n"); fprintf(fp, "numseqs: %d\n", alnObj->getNumSeqs()); // create a vector of seq lengths for later // and get shortest seq at the same time shortest=alnObj->getLengthLongestSequence(); for (i = 1; i <= alnObj->getNumSeqs(); i++) { int l = alnObj->getSeqLength(i); lengths.push_back(l); if (lgetLengthLongestSequence()); fprintf(fp, "seqlen shortest: %d\n", shortest); fprintf(fp, "seqlen avg: %.2f\n", utilityObject->average(lengths)); fprintf(fp, "seqlen std-dev: %.2f\n", utilityObject->stdDev(lengths)); fprintf(fp, "seqlen median: %.2f\n", utilityObject->median(lengths)); #ifdef HAVE_MHASH_H //hash = concatInputHash(alnObj); //fprintf(fp, "seq hash: %s\n", hash.c_str()); for (int s = 1; s <= alnObj->getNumSeqs(); s++) { string md5 = Md5ForSeq(alnObj, s); fprintf(fp, "md5 for seq %d: %s\n", s, md5.c_str()); } #else fprintf(fp, "md5: disabled\n"); #endif fclose(fp); } /* log some statistics for aligned sequences, i.e. alnObj should hold * the already aligned sequences * */ void Stats::logAlignedSeqStats(Alignment *alnObj) { FILE *fp = fopen(logfilename.c_str(), "a"); if (fp == NULL) { cerr << "couldn't open file " << logfilename << " for logging of stats\n"; return; } // alignment length is the length of any sequence fprintf(fp, "aln len: %d\n", alnObj->getSeqLength(1)); std::vector pwIdents; double lowestPwId = 1.0; double hightestPwId = 0.0; // create vector of pairwise identities for(int s1 = 1; s1 <= alnObj->getNumSeqs(); s1++) { for(int s2 = s1+1; s2 <= alnObj->getNumSeqs(); s2++) { double thisPwId = pairwiseIdentity(alnObj, s1, s2); pwIdents.push_back(thisPwId); if (thisPwId>hightestPwId) hightestPwId=thisPwId; if (thisPwIdgetName(s1).c_str(), alnObj->getName(s2).c_str(), s1, s2, PairwiseIdentity(alnObj, s1, s2)); } } fprintf(fp, "aln pw-id highest: %.2f\n", hightestPwId); fprintf(fp, "aln pw-id lowest: %.2f\n", lowestPwId); fprintf(fp, "aln pw-id avg: %.2f\n", utilityObject->average(pwIdents)); fprintf(fp, "aln pw-id std-dev: %.2f\n", utilityObject->stdDev(pwIdents)); fprintf(fp, "aln pw-id median: %.2f\n", utilityObject->median(pwIdents)); fclose(fp); } }