--- /dev/null
+/**
+ *
+ * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
+ */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <cstdlib>
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include <stdio.h>
+#include "../alignment/Alignment.h"
+#ifdef HAVE_MHASH_H
+#include "mhash.h"
+#endif
+
+#include "Stats.h"
+
+
+using namespace std;
+
+namespace clustalw
+{
+
+
+Stats::Stats()
+{
+ enabled = false;
+}
+
+
+Stats::~Stats()
+{
+}
+
+
+
+
+/* adopted from Sean Eddy'ssquid:aligneval.c */
+/* Function: PairwiseIdentity()
+ *
+ * Purpose: Calculate the pairwise fractional identity between
+ * two aligned sequences s1 and s2. This is simply
+ * (idents / MIN(len1, len2)).
+ *
+ * Note how many ways there are to calculate pairwise identity,
+ * because of the variety of choices for the denominator:
+ * idents/(idents+mismat) has the disadvantage that artifactual
+ * gappy alignments would have high "identities".
+ * idents/(AVG|MAX)(len1,len2) both have the disadvantage that
+ * alignments of fragments to longer sequences would have
+ * artifactually low "identities".
+ *
+ * Case sensitive; also, watch out in nucleic acid alignments;
+ * U/T RNA/DNA alignments will be counted as mismatches!
+ */
+/* float
+PairwiseIdentity(char @s1, char @s2)
+{
+ int idents; /@ total identical positions @/
+ int len1, len2; /@ lengths of seqs @/
+ int x; /@ position in aligned seqs @/
+
+ idents = len1 = len2 = 0;
+ for (x = 0; s1[x] != '\0' && s2[x] != '\0'; x++)
+ {
+ if (!isgap(s1[x])) {
+ len1++;
+ if (s1[x] == s2[x]) idents++;
+ }
+ if (!isgap(s2[x])) len2++;
+ }
+ if (len2 < len1) len1 = len2;
+ return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
+}
+*/
+
+
+/* s1/s2 are unit-offset as usual
+ *
+ */
+float
+Stats::pairwiseIdentity(Alignment *alnObj, int s1, int s2)
+{
+ int idents; /* total identical positions */
+ int len1, len2; /* real lengths of seqs */
+ int x; /* position in aligned seqs */
+
+ const vector<int>* seq1 = alnObj->getSequence(s1);
+ const vector<int>* seq2 = alnObj->getSequence(s2);
+
+ idents = len1 = len2 = 0;
+ // cerr << "comparing " << alnObj->getName(s1).c_str() << ":" << alnObj->getName(s2).c_str() << " " << s1 << ":" << s2 << "\n";
+
+ // sequence length should be identical, but be paranoid
+ for (x = 1; x<=alnObj->getSeqLength(s1) && x<=alnObj->getSeqLength(s2); x++)
+ {
+ if (! alnObj->isGap(s1, x)) {
+ len1++;
+ //cerr << " pos " << x << ": " << (*seq1)[x] << ":" << (*seq2)[x] << "\n";
+ if ((*seq1)[x] == (*seq2)[x])
+ idents++;
+ }
+ //DEBUG
+ //else {
+ //cerr << " gap at pos " << x << " (" << s1 << ")\n";
+ //}
+ if (! alnObj->isGap(s2, x))
+ len2++;
+ //DEBUG
+ //else
+ // cerr << " gap at pos " << x << " (" << s2 << ")\n";
+ }
+ if (len2 < len1)
+ len1 = len2;
+ return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
+}
+
+
+
+#ifdef HAVE_MHASH_H
+
+string
+Stats::ConcatInputHash(Alignment *alnObj)
+{
+ vector<string> rawSeqArray;
+ string ret;
+ char *hash;
+
+ // collect all sequences and sort
+ const clustalw::SeqArray* seqArray = alnObj->getSeqArray();
+ for(int s = 1; s <= alnObj->getNumSeqs(); s++)
+ {
+ string seq;
+ for(int r = 1; r <= alnObj->getSeqLength(s); r++)
+ {
+ int val = (*seqArray)[s][r];
+ seq.append(1, clustalw::userParameters->getAminoAcidCode(val));
+ }
+ rawSeqArray.push_back(seq);
+ }
+ std::sort(rawSeqArray.begin(), rawSeqArray.end());
+
+ // concatenate sorted seqs
+ string concatSeq;
+ std::vector<string>::iterator iter;
+ for(iter=rawSeqArray.begin(); iter != rawSeqArray.end(); ++iter)
+ concatSeq.append(*iter);
+
+ // build hash and return
+ hash = Md5Hash(concatSeq.c_str());
+ if (hash==NULL)
+ {
+ ret="HASHING_FAILURE";
+ } else {
+ for (int i=0; i<strlen(hash); i++)
+ ret.append(1, hash[i]);
+ free(hash);
+ }
+ return ret;
+}
+
+
+string
+Stats::Md5ForSeq(Alignment *alnObj, int s)
+{
+ string seq;
+ const clustalw::SeqArray* seqArray = alnObj->getSeqArray();
+ char *hash;
+ string ret;
+
+ for(int l = 1; l <= alnObj->getSeqLength(s); l++)
+ {
+ int val = (*seqArray)[s][l];
+ // continue if gap
+ if((val < 0) || (val > userParameters->getMaxAA()))
+ continue;
+ seq.append(1, clustalw::userParameters->getAminoAcidCode(val));
+ }
+ std::transform(seq.begin(), seq.end(), seq.begin(), ::toupper);
+
+ hash = Md5Hash(seq.c_str());
+ if (hash==NULL)
+ {
+ ret = "HASHING_FAILURE";
+ } else {
+ ret = hash;
+ }
+ return ret;
+}
+
+/* create md5 hash from input string
+ * returns NULL on failure
+ * user must free returned string
+ */
+char *
+Stats::Md5Hash(const char *thread)
+{
+ MHASH td;
+ char *tmpcstr;
+ int i;
+ unsigned char *hash;
+ char *rethash;
+
+ td = mhash_init(MHASH_MD5);
+ if (td == MHASH_FAILED)
+ return NULL;
+
+ if (thread==NULL)
+ return NULL;
+
+ mhash(td, thread, strlen(thread));
+ //mhash_deinit(td, hash);
+ hash = (unsigned char*) mhash_end(td);
+
+ rethash = (char*) calloc(mhash_get_block_size(MHASH_MD5)*2+1, sizeof(char));
+ for (i = 0; i < mhash_get_block_size(MHASH_MD5); i++) {
+ sprintf(&rethash[i*2], "%.2x", hash[i]);
+ }
+
+ return rethash;
+}
+
+#endif
+// HAVE_MHASH_H
+
+
+
+
+void
+Stats::logCmdLine(int argc, char **argv)
+{
+ FILE *fp = fopen(logfilename.c_str(), "a");
+ if (fp == NULL)
+ {
+ cerr << "couldn't open file " << logfilename << " for logging of stats\n";
+ return;
+ }
+
+ if (argc > 1)
+ {
+ for (int i=1; i<argc; i++)
+ {
+ // remove non-interesting stuff
+ if (strstr(argv[i], "-infile=")==NULL &&
+ strstr(argv[i], "-outfile=")==NULL &&
+ strstr(argv[i], "-stats=")==NULL &&
+ strstr(argv[i], "-align")==NULL)
+ {
+ fprintf(fp, "cmdline non-default arg: %s\n", argv[i]);
+ }
+ }
+ }
+}
+
+
+/* log some statistics for input sequences, i.e. alnObj should hold
+ * the unaligned sequences
+ *
+ */
+void
+Stats::logInputSeqStats(Alignment *alnObj)
+{
+ int i;
+ std::vector<double> lengths;
+ time_t t = time(NULL);
+ tm s = *localtime(&t);
+ int shortest;
+ string hash;
+
+ FILE *fp = fopen(logfilename.c_str(), "a");
+ if (fp == NULL)
+ {
+ cerr << "couldn't open file " << logfilename << " for logging of stats\n";
+ return;
+ }
+
+ fprintf(fp, "logging job: %s on %s", userParameters->getSeqName().c_str(), asctime(&s));
+ fprintf(fp, "clustal version: %s\n", userParameters->getRevisionLevel().c_str());
+
+ fprintf(fp, "seq type: ");
+ if (userParameters->getDNAFlag())
+ fprintf(fp, "DNA");
+ else
+ fprintf(fp, "protein");
+ fprintf(fp, "\n");
+
+ fprintf(fp, "numseqs: %d\n", alnObj->getNumSeqs());
+
+ // create a vector of seq lengths for later
+ // and get shortest seq at the same time
+ shortest=alnObj->getLengthLongestSequence();
+ for (i = 1; i <= alnObj->getNumSeqs(); i++) {
+ int l = alnObj->getSeqLength(i);
+ lengths.push_back(l);
+ if (l<shortest)
+ shortest=l;
+ }
+
+ fprintf(fp, "seqlen longest: %d\n", alnObj->getLengthLongestSequence());
+ fprintf(fp, "seqlen shortest: %d\n", shortest);
+
+ fprintf(fp, "seqlen avg: %.2f\n", utilityObject->average(lengths));
+ fprintf(fp, "seqlen std-dev: %.2f\n", utilityObject->stdDev(lengths));
+ fprintf(fp, "seqlen median: %.2f\n", utilityObject->median(lengths));
+
+
+#ifdef HAVE_MHASH_H
+ //hash = concatInputHash(alnObj);
+ //fprintf(fp, "seq hash: %s\n", hash.c_str());
+
+ for (int s = 1; s <= alnObj->getNumSeqs(); s++) {
+ string md5 = Md5ForSeq(alnObj, s);
+ fprintf(fp, "md5 for seq %d: %s\n", s, md5.c_str());
+
+ }
+#else
+ fprintf(fp, "md5: disabled\n");
+#endif
+
+ fclose(fp);
+}
+
+
+
+/* log some statistics for aligned sequences, i.e. alnObj should hold
+ * the already aligned sequences
+ *
+ */
+void
+Stats::logAlignedSeqStats(Alignment *alnObj)
+{
+ FILE *fp = fopen(logfilename.c_str(), "a");
+ if (fp == NULL)
+ {
+ cerr << "couldn't open file " << logfilename << " for logging of stats\n";
+ return;
+ }
+
+ // alignment length is the length of any sequence
+ fprintf(fp, "aln len: %d\n", alnObj->getSeqLength(1));
+
+
+ std::vector<double> pwIdents;
+ double lowestPwId = 1.0;
+ double hightestPwId = 0.0;
+
+ // create vector of pairwise identities
+ for(int s1 = 1; s1 <= alnObj->getNumSeqs(); s1++)
+ {
+ for(int s2 = s1+1; s2 <= alnObj->getNumSeqs(); s2++)
+ {
+ double thisPwId = pairwiseIdentity(alnObj, s1, s2);
+ pwIdents.push_back(thisPwId);
+ if (thisPwId>hightestPwId)
+ hightestPwId=thisPwId;
+ if (thisPwId<lowestPwId)
+ lowestPwId=thisPwId;
+ //fprintf(fp, "ident %s:%s %d:%d=%f\n", alnObj->getName(s1).c_str(), alnObj->getName(s2).c_str(), s1, s2, PairwiseIdentity(alnObj, s1, s2));
+ }
+ }
+ fprintf(fp, "aln pw-id highest: %.2f\n", hightestPwId);
+ fprintf(fp, "aln pw-id lowest: %.2f\n", lowestPwId);
+ fprintf(fp, "aln pw-id avg: %.2f\n", utilityObject->average(pwIdents));
+ fprintf(fp, "aln pw-id std-dev: %.2f\n", utilityObject->stdDev(pwIdents));
+ fprintf(fp, "aln pw-id median: %.2f\n", utilityObject->median(pwIdents));
+
+ fclose(fp);
+}
+
+}
+