/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /********************************************************************* * Clustal Omega - Multiple sequence alignment * * Copyright (C) 2010 University College Dublin * * Clustal-Omega is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This file is part of Clustal-Omega. * ********************************************************************/ /* * RCS $Id: hhhitlist-C.h 243 2011-05-31 13:49:19Z fabian $ */ // hhhitlist.C #ifndef MAIN #define MAIN #include // cin, cout, cerr #include // ofstream, ifstream #include // printf #include // exit #include // strcmp, strstr #include // sqrt, pow #include // INT_MIN #include // FLT_MIN #include // clock #include // islower, isdigit etc using std::ios; using std::ifstream; using std::ofstream; using std::cout; using std::cerr; using std::endl; #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "list.h" // list data structure #include "hash.h" // hash data structure #include "hhdecl-C.h" // constants, class #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "hhhmm.h" // class HMM #include "hhalignment.h" // class Alignment #include "hhhit.h" #include "hhhalfalignment.h" #include "hhfullalignment.h" #endif ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// //// Methods of class HitList ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// /** * @brief Print summary listing of hits */ void HitList::PrintHitList(HMM& q, char* outfile) { Hit hit; int nhits=0; char str[NAMELEN]=""; FILE* outf=NULL; if (strcmp(outfile,"stdout")) { outf=fopen(outfile,"w"); if (!outf) OpenFileError(outfile); } else outf = stdout; fprintf(outf,"Query %s\n",q.longname); // fprintf(outf,"Family %s\n",q.fam); fprintf(outf,"Match_columns %i\n",q.L); fprintf(outf,"No_of_seqs %i out of %i\n",q.N_filtered,q.N_in); fprintf(outf,"Neff %-4.1f\n",q.Neff_HMM); fprintf(outf,"Searched_HMMs %i\n",N_searched); // Print date stamp time_t* tp=new(time_t); *tp=time(NULL); fprintf(outf,"Date %s",ctime(tp)); delete (tp); (tp) = NULL; // Print command line fprintf(outf,"Command "); for (int i=0; i ",(int)strlen(par.argv[i])); fprintf(outf,"\n\n"); #ifdef WINDOWS if (par.trans) fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n"); else fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n"); #else if (par.trans) fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n"); else fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n"); #endif Reset(); while (!End()) // print hit list { hit = ReadNext(); if (nhits>=par.Z) break; //max number of lines reached? if (nhits>=par.z && hit.Probab < par.p) break; if (nhits>=par.z && hit.Eval > par.E) continue; // if (hit.matched_cols <=1) continue; // adding this might get to intransparent... analogous statement in PrintAlignments nhits++; sprintf(str,"%3i %-30.30s ",nhits,hit.longname); #ifdef WINDOWS if (par.trans) // Transitive scoring fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols); else // Normal scoring fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols); #else if (par.trans) // Transitive scoring fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols); else // Normal scoring fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols); #endif sprintf(str,"%4i-%-4i ",hit.i1,hit.i2); fprintf(outf,"%-10.10s",str); sprintf(str,"%4i-%-4i",hit.j1,hit.j2); fprintf(outf,"%-9.9s(%i)\n",str,hit.L); } //end print hit list fprintf(outf,"\n"); if (strcmp(outfile,"stdout")) fclose(outf); } ////////////////////////////////////////////////////////////////////////////// /** * @brief Print alignments of query sequences against hit sequences */ int HitList::PrintAlignments( #ifdef CLUSTALO char **ppcFirstProf, char **ppcSecndProf, #endif HMM& q, char* outfile, char outformat) { Hit hit; FullAlignment qt_ali(par.nseqdis+10); // maximum 10 annotation (pseudo) sequences (ss_dssp, sa_dssp, ss_pred, ss_conf, consens,...) int nhits=0; #ifndef CLUSTALO_NOFILE FILE* outf=NULL; if (strcmp(outfile,"stdout")) { if (outformat==0) outf=fopen(outfile,"a"); //append to summary hitlist else outf=fopen(outfile,"w"); //open for writing if (!outf) OpenFileError(outfile); } else outf = stdout; #endif Reset(); while (!End()) // print hit list { if (nhits>=par.B) break; //max number of lines reached? hit = ReadNext(); if (nhits>=par.b && hit.Probab < par.p) break; if (nhits>=par.b && hit.Eval > par.E) continue; // // adding this might get to intransparent... // // analogous statement in PrintHitlist and hhalign.C // if (hit.matched_cols <=1) continue; nhits++; // Build double alignment of query against template sequences int iBuildRet = qt_ali.Build(q,hit); if (iBuildRet != OK){ /* FS, r241 -> r243 */ fprintf(stderr, "%s:%s:%d: qt_ali.Build failed\n", __FUNCTION__, __FILE__, __LINE__); return FAILURE; } #ifndef CLUSTALO // Print out alignment if (outformat==0) // HHR format { fprintf(outf,"No %-3i\n",nhits); qt_ali.PrintHeader(outf,q,hit); qt_ali.PrintHHR(outf,hit); } else if (outformat==1) // FASTA format { fprintf(outf,"# No %-3i\n",nhits); qt_ali.PrintFASTA(outf,hit); } else if(outformat==2) // A2M format { fprintf(outf,"# No %-3i\n",nhits); qt_ali.PrintA2M(outf,hit); } else // A3m format { fprintf(outf,"# No %-3i\n",nhits); qt_ali.PrintA3M(outf,hit); } #else qt_ali.OverWriteSeqs(ppcFirstProf, ppcSecndProf); #endif qt_ali.FreeMemory(); } #ifndef CLUSTALO_NOFILE if (strcmp(outfile,"stdout")) fclose(outf); #endif return OK; } /* this is the end of HitList::PrintAlignments() */ //////////////////////////////////////////////////////////////////////////// /** * @brief Return the ROC_5 score for optimization * (changed 28.3.08 by Michael & Johannes) */ void HitList::Optimize(HMM& q, char* buffer) { const int NFAM =5; // calculate ROC_5 score const int NSFAM=5; // calculate ROC_5 score int roc=0; // ROC score int fam=0; // number of hits from same family (at current threshold) int not_fam=0; // number of hits not from same family int sfam=0; // number of hits from same suporfamily (at current threshold) int not_sfam=0; // number of hits not from same superfamily Hit hit; SortList(); Reset(); while (!End()) { hit = ReadNext(); if (!strcmp(hit.fam,q.fam)) fam++; // query and template from same superfamily? => positive else if (not_fam negative { not_fam++; roc += fam; } if (!strcmp(hit.sfam,q.sfam)) sfam++; // query and template from same superfamily? => positive else if (not_sfam negative { not_sfam++; roc += sfam; } // printf("qfam=%s tfam=%s qsfam=%s tsfam=%s fam=%-2i not_fam=%3i sfam=%-3i not_sfam=%-5i roc=%-3i\n",q.fam,hit.fam,q.sfam,hit.sfam,fam,not_fam,sfam,not_sfam,roc); } // Write ROC score to file or stdout FILE* buf=NULL; if (strcmp(par.buffer,"stdout")) { buf=fopen(buffer,"w"); if (!buf) OpenFileError(par.buffer); } else buf = stdout; fprintf(buf,"%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); // must be between 0 and 1 if (v>=2) printf("ROC=%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); fclose(buf); } ////////////////////////////////////////////////////////////////////////////// /** * @brief Print score distribution into file score_dist */ void HitList::PrintScoreFile(HMM& q) { int i=0, n; FILE* scoref=NULL; Hit hit; Hash twice(10000); // make sure only one hit per HMM is listed twice.Null(-1); if (strcmp(par.scorefile,"stdout")) { scoref=fopen(par.scorefile,"w"); if (!scoref) {cerr<0.99999) corr=0.99999; } else { if (corr<0.0) corr=0.0; else if (corr>1.0) corr=1.0; } return -s*(1.0-0.5*corr) - corr*log(1.0+s) + log(s*(1.0-0.5*corr)+0.5*corr); // return -s*(1.0-0.5*corr) + log( s*(1.0-corr)*(1.0-0.5*corr)+0.5*corr ); // return -s*(1.0-0.5*corr) + log((s*(1.0-corr)*(1.0-0.5*corr)+corr)*(1.0-0.5*corr)); } ///////////////////////////////////////////////////////////////////////////////////// /** * @brief Evaluate the *negative* log likelihood for the order statistic of the uniform distribution * for the best 10% of hits (vertex v = (corr,offset) ) * The k'th order statistic for X~Uniform is p:=X^(k)~Beta(k,n-k+1) = const*p^(k-1)*(1-p)^(n-k) * Needed to fit the correlation and score offset in HHblast */ double HitList::RankOrderFitCorr(double* v) { double sum=0.0; // printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof); int i1 = imin(Nprof,imax(50,int(0.05*Nprof))); for (int i=0; iRankOrderFitCorr(v); // call member function } ///////////////////////////////////////////////////////////////////////////////////// /** * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (corr,offset) * Needed to fit the correlation and score offset in HHblast */ double HitList::LogLikelihoodCorr(double* v) { double sum=0.0; // printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof); for (int i=0; iLogLikelihoodCorr(v); // call member function } ///////////////////////////////////////////////////////////////////////////////////// /** * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (lamda,mu) * p(s) = lamda * exp{ -exp[-lamda*(s-mu)] - lamda*(s-mu) } = lamda * exp( -exp(-x) - x) */ double HitList::LogLikelihoodEVD(double* v) { double sum=0.0, sumw=0.0; for (int i=0; iLogLikelihoodEVD(v); // call member function } ///////////////////////////////////////////////////////////////////////////////////// /** * @brief Subroutine to FindMin: try new point given by highest point ihigh and fac and replace ihigh if it is lower */ double HitList::TryPoint(const int ndim, double* p, double* y, double* psum, int ihigh, double fac, double (*Func)(void* pt2hitlist, double* v)) { // New point p_try = p_c + fac*(p_high-p_c), // where p_c = ( sum_i (p_i) - p_high)/ndim is the center of ndim other points // => p_try = fac1*sum_i(p_i) + fac2*p_high double fac1=(1.-fac)/ndim; double fac2=fac-fac1; double ptry[ndim]; //new point to try out double ytry; //function value of new point int j; //index for the ndim parameters for (j=0; j=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> accept\n",ptry[0],ptry[1],ytry); y[ihigh]=ytry; for (j=0; j=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> reject\n",ptry[0],ptry[1],ytry); return ytry; } ///////////////////////////////////////////////////////////////////////////////////// /** * @brief Find minimum with simplex method of Nelder and Mead (1965) */ float HitList::FindMin(const int ndim, double* p, double* y, double tol, int& nfunc, double (*Func)(void* pt2hitlist, double* v)) { const int MAXNFUNC=99; //maximum allowed number of function evaluations int ihigh; //index of highest point on simplex int inext; //index of second highest point on simplex int ilow; //index of lowest point on simplex int i; //index for the ndim+1 points int j; //index for the ndim parameters double rtol; //tolerance: difference of function value between highest and lowest point of simplex double temp; //dummy double ytry; //function value of trial point double psum[ndim]; //psum[j] = j'th coordinate of sum vector (sum over all vertex vectors) nfunc=0; //number of function evaluations =0 //Calculate sum vector psum[j] for (j=0; jy[1]) {inext=1; ihigh=0;} else {inext=0; ihigh=1;} for (i=0; iy[ihigh]) {inext=ihigh; ihigh=i;} else if (y[i]>y[inext] && i!= ihigh) inext=i; } // If tolerance in y is smaller than tol swap lowest point to index 0 and break -> return rtol = 2.*fabs(y[ihigh]-y[ilow]) / (fabs(y[ihigh])+fabs(y[ilow])+1E-10); if (rtol=MAXNFUNC ) { if (v) fprintf(stderr,"\nWARNING: maximum likelihood fit of score distribution did not converge.\n"); return 1; } nfunc+=2; // Point-reflect highest point on the center of gravity p_c of the other ndim points of the simplex if (v>=3) printf("%3i %-7.3f %-7.3f %-12.8f %-9.3E\n",nfunc,p[ilow*ndim],p[ilow*ndim+1],y[ilow],rtol); // if (v>=2) printf(" %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); ytry = TryPoint(ndim,p,y,psum,ihigh,-1.0,Func); //reflect highest point on p_c if (ytry<=y[ilow]) { ytry = TryPoint(ndim,p,y,psum,ihigh,2.0,Func); //expand: try new point 2x further away from p_c // if (v>=2) printf("Expanded: %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); } else if (ytry>=y[inext]) { // The new point is worse than the second worst point temp=y[ihigh]; ytry=TryPoint(ndim,p,y,psum,ihigh,0.5,Func); //contract simplex by 0.5 along (p_high-p_c // if (v>=2) printf("Compressed:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); if (ytry>=temp) { // Trial point is larger than worst point => contract simplex by 0.5 towards lowest point for (i=0; i=2) printf("Contracted:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); //Calculate psum[j] for (j=0; j size_fam(MAXPROF/10); // Hash counts number of HMMs in family static Hash size_sfam(MAXPROF/10); // Hash counts number of families in superfamily Hash excluded(50); // Hash containing names of superfamilies to be excluded from fit size_fam.Null(0); // Set int value to return when no data can be retrieved size_sfam.Null(0); // Set int value to return when no data can be retrieved excluded.Null(0); // Set int value to return when no data can be retrieved Hit hit; double mu; // EVD[mu,lam](x) = exp(-exp(-(x-mu)/lam)) = P(score<=x) double vertex[2*3]; // three vertices of the simplex in lamda-mu plane double yvertex[3]; // log likelihood values at the three vertices of the simplex int nfunc=0; // number of function calls double sum_weights=0.0; float sum_scores=0.0; float rtol; if (first_call==1) { first_call=0; // Count how many HMMs are in each family; set number of multiple hits per template nrep if (v>=4) printf(" count number of profiles in each family and families in each superfamily ...\n"); Reset(); while (!End()) { hit = ReadNext(); if (!size_fam.Contains(hit.fam)) (*size_sfam(hit.sfam))++; //Add one to hash element for superfamily (*size_fam(hit.fam))++; //Add one to hash element for family // printf("size(%s)=%i name=%s\n",hit.fam,*size_fam(hit.fam),hit.name) } fams=size_fam.Size(); sfams=size_sfam.Size(); if (v>=3) printf("%-3i HMMs from %i families and %i superfamilies searched. Found %i hits\n",N_searched,fams,sfams,Size()); } // Query has SCOP family identifier? if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.') { char sfamid[NAMELEN]; char* ptr_in_fam=q.fam; while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) { char* ptr=strrchr(sfamid,'.'); if (ptr) *ptr='\0'; excluded.Add(sfamid); // fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam); } } // Exclude best superfamilies from fit else if (nbest>0) { if (sfams<97+nbest) return; // Find the nbest best-scoring superfamilies for exclusion from first ML fit if (v>=4) printf(" find %i best-scoring superfamilies to exclude from first fit ...\n",nbest); hit = Smallest(); excluded.Add(hit.sfam); // printf("Exclude in first round: %s %8.2f %s\n",hit.name,hit.score_aass,hit.sfam); while (excluded.Size()=4) printf(" find best-scoring superfamilies to exclude from second fit ...\n"); Reset(); while (!End()) { hit = ReadNext(); if (hit.Eval < 0.05) excluded.Add(hit.sfam); // changed from 0.5 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! } tol = 0.001/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e } // Put scores into score[] and weights into weight[] if (v>=3) printf(" generate scores and weights array for ML fitting ...\n"); Nprof=0; Reset(); while (!End()) { hit = ReadNext(); if (hit.irep > 1) continue; //Use only best hit per template if (Nprof>=MAXPROF) break; char sfamid[NAMELEN]; char* ptr_in_fam=hit.fam; while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) { char* ptr=strrchr(sfamid,'.'); if (ptr) *ptr='\0'; if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded } if (excluded.Contains(sfamid)) { if (v>=3) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid); continue; } // ScopID(hit.cl,hit.fold,hit.sfam,hit.fam); //Get scop superfamily code for template // if (*hit.sfam=='\0' || excluded.Contains(hit.sfam)) continue; // skip HMM score[Nprof] = hit.score; weight[Nprof]=1./size_fam[hit.fam]/size_sfam[hit.sfam]; sum_scores +=hit.score*weight[Nprof]; sum_weights+=weight[Nprof]; //DEBUG // if (v>=4) printf("%-10.10s %-12.12s %-3i %-12.12s %-3i %6.4f %6.4f %7.1f\n",hit.name,hit.fam,size_fam[hit.fam],hit.sfam,size_sfam[hit.sfam],1./size_fam[hit.fam]/size_sfam[hit.sfam],sum,hit.score); Nprof++; } //DEBUG if (v>=3) printf("%i hits used for score distribution\n",Nprof); // for (int i=0; i0) {vertex[0]=LAMDA; vertex[1]=mu;} /////////////////////////////////////////// DEBUG else {vertex[0]=q.lamda; vertex[1]=mu;} vertex[2]=vertex[0]+0.1; vertex[3]=vertex[1]; vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2; yvertex[0]=Func(this,vertex ); yvertex[1]=Func(this,vertex+2); yvertex[2]=Func(this,vertex+4); // Find lam and mu that minimize negative log likelihood of data if (v>=3) printf("Fitting to EVD by maximum likelihood...\niter lamda mu -log(P)/N tol\n"); rtol = FindMin(2,vertex,yvertex,tol,nfunc,Func); if (v>=3) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0]-(1.5772-log(vertex[0])),rtol); // printf("HHsearch lamda=%-6.3f mu=%-6.3f\n",vertex[0],vertex[1]); } else { vertex[0]=LAMDA_GLOB; vertex[1]=mu; } // Set lamda and mu of profile q.lamda = vertex[0]; q.mu = vertex[1]; // Set P-values and E-values // CHECK UPDATE FROM score=-logpval to score=-logpval+SSSCORE2NATLOG*score_ss !!!! Reset(); while (!End()) { hit = ReadNext(); // Calculate total score in raw score units: P-value = 1- exp(-exp(-lamda*(Saa-mu))) hit.weight=1./size_fam[hit.fam]/size_sfam[hit.sfam]; // needed for transitive scoring hit.logPval = logPvalue(hit.score,vertex); hit.Pval=Pvalue(hit.score,vertex); hit.Eval=exp(hit.logPval+log(N_searched)); // hit.score_aass = hit.logPval/0.45-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~4.0 in EVDs for scop20.1.63 HMMs hit.score_aass = -q.lamda*(hit.score-q.mu)/0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0)); // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs hit.Probab = Probab(hit); if (nbest>0 && par.loc) // correct length correction (not needed for second round of fitting, since lamda very similar) if (par.idummy==0) //////////////////////////////////////////// hit.score += log(q.L*hit.L)*(1/LAMDA-1/vertex[0]); hit.score_sort = hit.score_aass; Overwrite(hit); // copy hit object into current position of hitlist if (nbest==0 && par.trans==1) // if in transitive scoring mode (weights file given) TransitiveScoring(); else if (nbest==0 && par.trans==2) // if in transitive scoring mode (weights file given) TransitiveScoring2(); else if (nbest==0 && par.trans==3) // if in transitive scoring mode (weights file given) TransitiveScoring3(); else if (nbest==0 && par.trans==4) // if in transitive scoring mode (weights file given) TransitiveScoring4(); } } ///////////////////////////////////////////////////////////////////////////////////// /** * @brief Calculate correlation and score offset for HHblast composite E-values */ void HitList::CalculateHHblastCorrelation(HMM& q) { int nfunc=0; // number of function calls double tol; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood) double vertex[2*3]; // three vertices of the simplex in lamda-mu plane double yvertex[3]; // log likelihood values at the three vertices of the simplex Hit hit; Hash excluded(50); // Hash containing names of superfamilies to be excluded from fit excluded.Null(0); // Set int value to return when no data can be retrieved // Set sum of HHsearch and PSI-BLAST score for calculating correlation Reset(); while (!End()) { hit = ReadNext(); hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); // if template not in hash, return log Pval = 0, i.e. Pvalue = 1! Overwrite(hit); // copy hit object into current position of hitlist } // Query has SCOP family identifier? if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.') { char sfamid[NAMELEN]; char* ptr_in_fam=q.fam; while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) { char* ptr=strrchr(sfamid,'.'); if (ptr) *ptr='\0'; excluded.Add(sfamid); fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam); } } // Resort list by sum of log P-values ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues Nprof=0; Reset(); ReadNext(); // skip best hit while (!End()) { hit = ReadNext(); if (hit.irep>=2) continue; // use only best alignments // if (hit.Eval<0.005) {if (v>=3) printf("Fitting HHblast correlation coefficient: skipping %s with Evalue=%9.1g\n",hit.name,hit.Eval); continue;} if (Nprof>=MAXPROF) break; char sfamid[NAMELEN]; char* ptr_in_fam=hit.fam; while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) { char* ptr=strrchr(sfamid,'.'); if (ptr) *ptr='\0'; if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded } if (excluded.Contains(sfamid)) { if (v>=1) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid); continue; } score[Nprof] = -hit.score_sort; weight[Nprof] = 1.0; // = hit.weight; // printf("%3i %-12.12s %7.3f + %7.3f = %7.3f \n",Nprof,hit.name,hit.logPval,blast_logPvals->Show(hit.name),-hit.score_sort); ////////////////////// printf("%3i %7.3f %7.3f\n",Nprof,hit.Pval,exp(blast_logPvals->Show(hit.name))); ////////////////////// Nprof++; } // Fit correlation vertex[0]=0.5; vertex[1]=0.2; vertex[2]=vertex[0]+0.2; vertex[3]=vertex[1]; vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2; yvertex[0]=RankOrderFitCorr(vertex ); yvertex[1]=RankOrderFitCorr(vertex+2); yvertex[2]=RankOrderFitCorr(vertex+4); // yvertex[0]=LogLikelihoodCorr(vertex ); // yvertex[1]=LogLikelihoodCorr(vertex+2); // yvertex[2]=LogLikelihoodCorr(vertex+4); tol = 1e-6; v=3;////////////////////////////////// // Find correlation and offset that minimize mean square deviation of reported composite Pvalues from actual if (v>=2) printf("Fitting correlation coefficient for HHblast...\niter corr offset logLikelihood tol\n"); float rtol = FindMin(2,vertex,yvertex,tol,nfunc, HitList::RankOrderFitCorr_static); if (v>=2) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0],rtol); if (vertex[0]<0) vertex[0]=0.0; // Print correlation and offset for profile printf("HHblast correlation=%-6.3f score offset=%-6.3f\n",vertex[0],vertex[1]); v=2;////////////////////////////////// } /** * @brief Calculate HHblast composite E-values */ inline double corr_HHblast(float Nq, float Nt) { return 0.5; } /** * @brief Calculate HHblast composite E-values */ inline double offset_HHblast(float Nq, float Nt) { return 0.0; } ////////////////////////////////////////////////////////////////////////////// /** * @brief Calculate HHblast composite E-values */ void HitList::CalculateHHblastEvalues(HMM& q) { Hit hit; float corr, offset; // correlation coefficient and offset for calculating composite HHblast P-values Reset(); while (!End()) { hit = ReadNext(); corr = corr_HHblast(q.Neff_HMM,hit.Neff_HMM); offset = offset_HHblast(q.Neff_HMM,hit.Neff_HMM); hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); hit.logPval = logPvalue_HHblast(-hit.score_sort+offset,corr); // overwrite logPval from HHsearch with composite logPval from HHblast hit.Pval = Pvalue_HHblast(-hit.score_sort+offset,corr); // overwrite P-value from HHsearch with composite P-value from HHblast hit.Eval = exp(hit.logPval+log(N_searched)); // overwrite E-value from HHsearch with composite E-value from HHblast hit.Probab = Probab(hit); Overwrite(hit); // copy hit object into current position of hitlist } ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues } ////////////////////////////////////////////////////////////////////////////// /** * @brief Read file generated by blastpgp (default output) and store P-values in hash */ void HitList::ReadBlastFile(HMM& q) { char line[LINELEN]=""; // input line int Ndb; // number of sequences in database int Ldb=0; // size of database in number of amino acids char* templ; int i; if (!blast_logPvals) { blast_logPvals = new(Hash); blast_logPvals->New(16381,0); } FILE* blaf = NULL; if (!strcmp(par.blafile,"stdin")) blaf=stdin; else { blaf = fopen(par.blafile,"rb"); if (!blaf) OpenFileError(par.blafile); } // Read number of sequences and size of database while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"sequences;")); if (!strstr(line,"sequences;")) FormatError(par.blafile,"No 'Database:' string found."); char* ptr=line; Ndb = strint(ptr); if (Ndb==INT_MIN) FormatError(par.blafile,"No integer for number of sequences in database found."); while ((i=strint(ptr))>INT_MIN) Ldb = 1000*Ldb + i; if (Ldb==0) FormatError(par.blafile,"No integer for size of database found."); printf("\nNumber of sequences in database = %i Size of database = %i\n",Ndb,Ldb); // Read all E-values and sequence lengths while (fgetline(line,LINELEN-1,blaf)) { if (line[0]=='>') { // Read template name templ = new(char[255]); ptr = line+1; strwrd(templ,ptr); if (!blast_logPvals->Contains(templ)) // store logPval only for best HSP with template { // Read length while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"Length =")); ptr = line+18; int length = strint(ptr); // Read E-value fgetline(line,LINELEN-1,blaf); fgetline(line,LINELEN-1,blaf); float EvalDB; // E-value[seq-db] = Evalue for comparison Query vs. database, from PSI-BLAST float EvalQT; // E-value[seq-seq] = Evalue for comparison Query vs. template (seq-seq) double logPval; ptr = strstr(line+20,"Expect ="); if (!ptr) FormatError(par.blafile,"No 'Expect =' string found."); if (sscanf(ptr+8,"%g",&EvalDB)<1) { ptr[7]='1'; if (sscanf(ptr+7,"%g",&EvalDB)<1) FormatError(par.blafile,"No Evalue found after 'Expect ='."); } // Calculate P-value[seq-seq] = 1 - exp(-E-value[seq-seq]) = 1 - exp(-Lt/Ldb*E-value[seq-db]) EvalQT = length/double(Ldb)*double(EvalDB); if (EvalQT>1E-3) logPval = log(1.0-exp(-EvalQT)); else logPval=log(double(EvalQT)+1.0E-99); blast_logPvals->Add(templ,logPval); printf("template=%-10.10s length=%-3i EvalDB=%8.2g EvalQT=%8.2g P-value=%8.2g log Pval=%8.2g\n",templ,length,EvalDB,EvalQT,exp(logPval),logPval); } else { delete[] templ; templ = NULL; } } } fclose(blaf); } ///////////////////////////////////////////////////////////////////////////////////// /** * @brief Calculate output of hidden neural network units */ inline float calc_hidden_output(const float* weights, const float* bias, float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm) { float res; // Calculate activation of hidden unit = sum of all inputs * weights + bias res = Lqnorm*weights[0] + Ltnorm*weights[1] + Nqnorm*weights[2] + Ntnorm*weights[3] + *bias; res = 1.0 / (1.0 + exp(-(res ))); // logistic function return res; } //////////////////////////////////////////////////////////////////////////////////// /** * @brief Neural network regressions of lamda for EVD */ inline float lamda_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm) { const int inputs = 4; const int hidden = 4; const float biases[] = {-0.73195, -1.43792, -1.18839, -3.01141}; // bias for all hidden units const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit) -0.52356, -3.37650, 1.12984, -0.46796, -4.71361, 0.14166, 1.66807, 0.16383, -0.94895, -1.24358, -1.20293, 0.95434, -0.00318, 0.53022, -0.04914, -0.77046, 2.45630, 3.02905, 2.53803, 2.64379 }; float lamda=0.0; for (int h = 0; h