1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: hhhitlist-C.h 243 2011-05-31 13:49:19Z fabian $
25 #include <iostream> // cin, cout, cerr
26 #include <fstream> // ofstream, ifstream
27 #include <stdio.h> // printf
28 #include <stdlib.h> // exit
29 #include <string> // strcmp, strstr
30 #include <math.h> // sqrt, pow
31 #include <limits.h> // INT_MIN
32 #include <float.h> // FLT_MIN
33 #include <time.h> // clock
34 #include <ctype.h> // islower, isdigit etc
41 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
42 #include "list.h" // list data structure
43 #include "hash.h" // hash data structure
44 #include "hhdecl-C.h" // constants, class
45 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
46 #include "hhhmm.h" // class HMM
47 #include "hhalignment.h" // class Alignment
49 #include "hhhalfalignment.h"
50 #include "hhfullalignment.h"
54 //////////////////////////////////////////////////////////////////////////////
55 //////////////////////////////////////////////////////////////////////////////
56 //// Methods of class HitList
57 //////////////////////////////////////////////////////////////////////////////
58 //////////////////////////////////////////////////////////////////////////////
62 //////////////////////////////////////////////////////////////////////////////
64 * @brief Print summary listing of hits
67 HitList::PrintHitList(HMM& q, char* outfile)
74 if (strcmp(outfile,"stdout"))
76 outf=fopen(outfile,"w");
77 if (!outf) OpenFileError(outfile);
83 fprintf(outf,"Query %s\n",q.longname);
84 // fprintf(outf,"Family %s\n",q.fam);
85 fprintf(outf,"Match_columns %i\n",q.L);
86 fprintf(outf,"No_of_seqs %i out of %i\n",q.N_filtered,q.N_in);
87 fprintf(outf,"Neff %-4.1f\n",q.Neff_HMM);
88 fprintf(outf,"Searched_HMMs %i\n",N_searched);
91 time_t* tp=new(time_t);
93 fprintf(outf,"Date %s",ctime(tp));
94 delete (tp); (tp) = NULL;
97 fprintf(outf,"Command ");
98 for (int i=0; i<par.argc; i++)
99 if (strlen(par.argv[i])<=par.maxdbstrlen)
100 fprintf(outf,"%s ",par.argv[i]);
102 fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i]));
103 fprintf(outf,"\n\n");
107 fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n");
109 fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n");
112 fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n");
114 fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n");
118 while (!End()) // print hit list
121 if (nhits>=par.Z) break; //max number of lines reached?
122 if (nhits>=par.z && hit.Probab < par.p) break;
123 if (nhits>=par.z && hit.Eval > par.E) continue;
124 // if (hit.matched_cols <=1) continue; // adding this might get to intransparent... analogous statement in PrintAlignments
126 sprintf(str,"%3i %-30.30s ",nhits,hit.longname);
130 if (par.trans) // Transitive scoring
131 fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols);
132 else // Normal scoring
133 fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols);
135 if (par.trans) // Transitive scoring
136 fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols);
137 else // Normal scoring
138 fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols);
141 sprintf(str,"%4i-%-4i ",hit.i1,hit.i2);
142 fprintf(outf,"%-10.10s",str);
143 sprintf(str,"%4i-%-4i",hit.j1,hit.j2);
144 fprintf(outf,"%-9.9s(%i)\n",str,hit.L);
145 } //end print hit list
147 if (strcmp(outfile,"stdout")) fclose(outf);
152 //////////////////////////////////////////////////////////////////////////////
154 * @brief Print alignments of query sequences against hit sequences
157 HitList::PrintAlignments(
161 char **ppcFirstProf, char **ppcSecndProf,
163 HMM& q, char* outfile, char outformat)
166 FullAlignment qt_ali(par.nseqdis+10); // maximum 10 annotation (pseudo) sequences (ss_dssp, sa_dssp, ss_pred, ss_conf, consens,...)
169 #ifndef CLUSTALO_NOFILE
171 if (strcmp(outfile,"stdout"))
174 outf=fopen(outfile,"a"); //append to summary hitlist
176 outf=fopen(outfile,"w"); //open for writing
177 if (!outf) OpenFileError(outfile);
184 while (!End()) // print hit list
186 if (nhits>=par.B) break; //max number of lines reached?
188 if (nhits>=par.b && hit.Probab < par.p) break;
189 if (nhits>=par.b && hit.Eval > par.E) continue;
190 // // adding this might get to intransparent...
191 // // analogous statement in PrintHitlist and hhalign.C
192 // if (hit.matched_cols <=1) continue;
195 // Build double alignment of query against template sequences
196 int iBuildRet = qt_ali.Build(q,hit);
197 if (iBuildRet != OK){ /* FS, r241 -> r243 */
198 fprintf(stderr, "%s:%s:%d: qt_ali.Build failed\n",
199 __FUNCTION__, __FILE__, __LINE__);
204 // Print out alignment
205 if (outformat==0) // HHR format
207 fprintf(outf,"No %-3i\n",nhits);
208 qt_ali.PrintHeader(outf,q,hit);
209 qt_ali.PrintHHR(outf,hit);
211 else if (outformat==1) // FASTA format
213 fprintf(outf,"# No %-3i\n",nhits);
214 qt_ali.PrintFASTA(outf,hit);
216 else if(outformat==2) // A2M format
218 fprintf(outf,"# No %-3i\n",nhits);
219 qt_ali.PrintA2M(outf,hit);
223 fprintf(outf,"# No %-3i\n",nhits);
224 qt_ali.PrintA3M(outf,hit);
227 qt_ali.OverWriteSeqs(ppcFirstProf, ppcSecndProf);
232 #ifndef CLUSTALO_NOFILE
233 if (strcmp(outfile,"stdout")) fclose(outf);
238 } /* this is the end of HitList::PrintAlignments() */
244 ////////////////////////////////////////////////////////////////////////////
246 * @brief Return the ROC_5 score for optimization
247 * (changed 28.3.08 by Michael & Johannes)
250 HitList::Optimize(HMM& q, char* buffer)
252 const int NFAM =5; // calculate ROC_5 score
253 const int NSFAM=5; // calculate ROC_5 score
254 int roc=0; // ROC score
255 int fam=0; // number of hits from same family (at current threshold)
256 int not_fam=0; // number of hits not from same family
257 int sfam=0; // number of hits from same suporfamily (at current threshold)
258 int not_sfam=0; // number of hits not from same superfamily
266 if (!strcmp(hit.fam,q.fam)) fam++; // query and template from same superfamily? => positive
267 else if (not_fam<NFAM) // query and template from different family? => negative
272 if (!strcmp(hit.sfam,q.sfam)) sfam++; // query and template from same superfamily? => positive
273 else if (not_sfam<NSFAM) // query and template from different superfamily? => negative
278 // printf("qfam=%s tfam=%s qsfam=%s tsfam=%s fam=%-2i not_fam=%3i sfam=%-3i not_sfam=%-5i roc=%-3i\n",q.fam,hit.fam,q.sfam,hit.sfam,fam,not_fam,sfam,not_sfam,roc);
281 // Write ROC score to file or stdout
283 if (strcmp(par.buffer,"stdout"))
285 buf=fopen(buffer,"w");
286 if (!buf) OpenFileError(par.buffer);
291 fprintf(buf,"%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); // must be between 0 and 1
292 if (v>=2) printf("ROC=%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM));
298 //////////////////////////////////////////////////////////////////////////////
300 * @brief Print score distribution into file score_dist
303 HitList::PrintScoreFile(HMM& q)
308 Hash<int> twice(10000); // make sure only one hit per HMM is listed
311 if (strcmp(par.scorefile,"stdout"))
313 scoref=fopen(par.scorefile,"w");
315 {cerr<<endl<<"WARNING from "<<par.argv[0]<<": could not open \'"<<par.scorefile<<"\'\n"; return;}
320 fprintf(scoref,"NAME %s\n",q.longname);
321 fprintf(scoref,"FAM %s\n",q.fam);
322 fprintf(scoref,"FILE %s\n",q.file);
323 fprintf(scoref,"LENG %i\n",q.L);
324 fprintf(scoref,"\n");
325 //fprintf(scoref,"TARGET REL LEN COL LOG-PVA S-TOT MS NALI\n");
327 //For hhformat, the PROBAB field has to start at position 41 !!
328 // ----+----1----+----2----+----3----+----4----+----
329 fprintf(scoref,"TARGET FAMILY REL LEN COL LOG-PVA S-AASS PROBAB SCORE\n");
330 // d153l__ 5 185 185 287.82 464.22 100.00
331 // d1qsaa2 3 168 124 145.55 239.22 57.36
336 if (twice[hit.name]==1) continue; // better hit with same HMM has been listed already
337 twice.Add(hit.name,1);
338 //if template and query are from the same superfamily
339 if (!strcmp(hit.name,q.name)) n=5;
340 else if (!strcmp(hit.fam,q.fam)) n=4;
341 else if (!strcmp(hit.sfam,q.sfam)) n=3;
342 else if (!strcmp(hit.fold,q.fold)) n=2;
343 else if (!strcmp(hit.cl,q.cl)) n=1;
345 fprintf(scoref,"%-10s %-10s %1i %3i %3i %s %7.2f %6.2f %7.2f\n",hit.name,hit.fam,n,hit.L,hit.matched_cols,sprintg(-1.443*hit.logPval,7),-hit.score_aass,hit.Probab,hit.score);
352 logPvalue_HHblast(double s, double corr)
354 return -s*(1.0-0.5*corr) + (1.0-corr)*log(1.0+s);
355 // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*s );
356 // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*(1.0-0.5*corr)*s );
360 Pvalue_HHblast(double s, double corr)
362 return exp(-s*(1.0-0.5*corr)) * pow(1.0+s,1.0-corr);
363 // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*s );
364 // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*(1.0-0.5*corr)*s );
368 logLikelihood_HHblast(double s, double corr)
370 if (s<0.0) { s=0.0; if (corr<1E-5) corr=1E-5; else if (corr>0.99999) corr=0.99999; }
371 else { if (corr<0.0) corr=0.0; else if (corr>1.0) corr=1.0; }
372 return -s*(1.0-0.5*corr) - corr*log(1.0+s) + log(s*(1.0-0.5*corr)+0.5*corr);
373 // return -s*(1.0-0.5*corr) + log( s*(1.0-corr)*(1.0-0.5*corr)+0.5*corr );
374 // return -s*(1.0-0.5*corr) + log((s*(1.0-corr)*(1.0-0.5*corr)+corr)*(1.0-0.5*corr));
377 /////////////////////////////////////////////////////////////////////////////////////
379 * @brief Evaluate the *negative* log likelihood for the order statistic of the uniform distribution
380 * for the best 10% of hits (vertex v = (corr,offset) )
381 * The k'th order statistic for X~Uniform is p:=X^(k)~Beta(k,n-k+1) = const*p^(k-1)*(1-p)^(n-k)
382 * Needed to fit the correlation and score offset in HHblast
385 HitList::RankOrderFitCorr(double* v)
388 // printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof);
389 int i1 = imin(Nprof,imax(50,int(0.05*Nprof)));
390 for (int i=0; i<i1; i++)
392 double p = Pvalue_HHblast(score[i]+v[1],v[0]);
393 // sum -= (1.0-double(i)/double(i1)) * weight[i] * ( double(i)*log(p) + (Nprof-i-1.0)*log(1.0-p) );
394 float diff = p-(float(i)+1.0)/(Nprof+1.0);
395 sum += (1.0-double(i)/double(i1)) * weight[i]*diff*diff*(Nprof+1.0)*(Nprof+1.0)*(Nprof+2.0)/(i+10.0)/(Nprof-i);
396 // printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f sum=%7.5f\n",i,p,float(i+1)/(1.0+Nprof),diff,sum);
402 * @brief Static wrapper-function for calling the nonstatic member function RankOrderFitCorr()
403 * ( see http://www.newty.de/fpt/callback.html#member )
406 HitList::RankOrderFitCorr_static(void* pt2hitlist, double* v)
408 HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist
409 return mySelf->RankOrderFitCorr(v); // call member function
412 /////////////////////////////////////////////////////////////////////////////////////
414 * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (corr,offset)
415 * Needed to fit the correlation and score offset in HHblast
418 HitList::LogLikelihoodCorr(double* v)
421 // printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof);
422 for (int i=0; i<Nprof; i++)
424 sum -= weight[i]*logLikelihood_HHblast(score[i]+v[1],v[0]);
425 // printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f rmsd=%7.5f sum=%7.5f\n",i,Pvalue_HHblast(score[i],v[0]),float(i)/(1.0+Nprof),x,sqrt(sum/sumw),sum);
431 * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodCorr()
432 * ( see http://www.newty.de/fpt/callback.html#member )
435 HitList::LogLikelihoodCorr_static(void* pt2hitlist, double* v)
437 HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist
438 return mySelf->LogLikelihoodCorr(v); // call member function
441 /////////////////////////////////////////////////////////////////////////////////////
443 * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (lamda,mu)
444 * p(s) = lamda * exp{ -exp[-lamda*(s-mu)] - lamda*(s-mu) } = lamda * exp( -exp(-x) - x)
447 HitList::LogLikelihoodEVD(double* v)
449 double sum=0.0, sumw=0.0;
450 for (int i=0; i<Nprof; i++)
452 double x = v[0]*(score[i]-v[1]);
453 sum += weight[i]*(exp(-x)+x);
456 return sum - sumw*log(v[0]);
460 * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodEVD()
461 * ( see http://www.newty.de/fpt/callback.html#member )
464 HitList::LogLikelihoodEVD_static(void* pt2hitlist, double* v)
466 HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist
467 return mySelf->LogLikelihoodEVD(v); // call member function
470 /////////////////////////////////////////////////////////////////////////////////////
472 * @brief Subroutine to FindMin: try new point given by highest point ihigh and fac and replace ihigh if it is lower
475 HitList::TryPoint(const int ndim, double* p, double* y, double* psum, int ihigh, double fac, double (*Func)(void* pt2hitlist, double* v))
477 // New point p_try = p_c + fac*(p_high-p_c),
478 // where p_c = ( sum_i (p_i) - p_high)/ndim is the center of ndim other points
479 // => p_try = fac1*sum_i(p_i) + fac2*p_high
480 double fac1=(1.-fac)/ndim;
481 double fac2=fac-fac1;
482 double ptry[ndim]; //new point to try out
483 double ytry; //function value of new point
484 int j; //index for the ndim parameters
486 for (j=0; j<ndim; j++)
487 ptry[j]=psum[j]*fac1+p[ihigh*ndim+j]*fac2;
488 ytry = (*Func)(this,ptry);
491 // if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> accept\n",ptry[0],ptry[1],ytry);
493 for (j=0; j<ndim; j++)
495 psum[j] += ptry[j]-p[ihigh*ndim+j]; //update psum[j]
496 p[ihigh*ndim+j]=ptry[j]; //replace p[ihigh] with ptry
497 } //Note: ihigh is now not highest point anymore!
499 // else if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> reject\n",ptry[0],ptry[1],ytry);
506 /////////////////////////////////////////////////////////////////////////////////////
508 * @brief Find minimum with simplex method of Nelder and Mead (1965)
511 HitList::FindMin(const int ndim, double* p, double* y, double tol, int& nfunc, double (*Func)(void* pt2hitlist, double* v))
513 const int MAXNFUNC=99; //maximum allowed number of function evaluations
514 int ihigh; //index of highest point on simplex
515 int inext; //index of second highest point on simplex
516 int ilow; //index of lowest point on simplex
517 int i; //index for the ndim+1 points
518 int j; //index for the ndim parameters
519 double rtol; //tolerance: difference of function value between highest and lowest point of simplex
521 double ytry; //function value of trial point
522 double psum[ndim]; //psum[j] = j'th coordinate of sum vector (sum over all vertex vectors)
524 nfunc=0; //number of function evaluations =0
525 //Calculate sum vector psum[j]
526 for (j=0; j<ndim; j++)
529 for (i=1; i<ndim+1; i++)
530 psum[j]+=p[i*ndim+j];
533 // Repeat finding better points in simplex until rtol<tol
536 // Find indices for highest, next highest and lowest point
538 if (y[0]>y[1]) {inext=1; ihigh=0;} else {inext=0; ihigh=1;}
539 for (i=0; i<ndim+1; i++)
541 if (y[i]<=y[ilow]) ilow=i;
542 if (y[i]>y[ihigh]) {inext=ihigh; ihigh=i;}
543 else if (y[i]>y[inext] && i!= ihigh) inext=i;
546 // If tolerance in y is smaller than tol swap lowest point to index 0 and break -> return
547 rtol = 2.*fabs(y[ihigh]-y[ilow]) / (fabs(y[ihigh])+fabs(y[ilow])+1E-10);
550 temp=y[ilow]; y[ilow]=y[0]; y[0]=temp;
551 for (j=0; j<ndim; j++)
553 temp=p[ilow*ndim+j]; p[ilow*ndim+j]=p[j]; p[j]=temp;
558 // Max number of function evaluations exceeded?
559 if (nfunc>=MAXNFUNC )
561 if (v) fprintf(stderr,"\nWARNING: maximum likelihood fit of score distribution did not converge.\n");
566 // Point-reflect highest point on the center of gravity p_c of the other ndim points of the simplex
567 if (v>=3) printf("%3i %-7.3f %-7.3f %-12.8f %-9.3E\n",nfunc,p[ilow*ndim],p[ilow*ndim+1],y[ilow],rtol);
568 // if (v>=2) printf(" %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
569 ytry = TryPoint(ndim,p,y,psum,ihigh,-1.0,Func); //reflect highest point on p_c
573 ytry = TryPoint(ndim,p,y,psum,ihigh,2.0,Func); //expand: try new point 2x further away from p_c
574 // if (v>=2) printf("Expanded: %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
576 else if (ytry>=y[inext])
578 // The new point is worse than the second worst point
580 ytry=TryPoint(ndim,p,y,psum,ihigh,0.5,Func); //contract simplex by 0.5 along (p_high-p_c
581 // if (v>=2) printf("Compressed:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
584 // Trial point is larger than worst point => contract simplex by 0.5 towards lowest point
585 for (i=0; i<ndim+1; i++)
589 for (j=0; j<ndim; j++)
590 p[i*ndim+j]=0.5*(p[i*ndim+j]+p[ilow+j]);
591 y[i] = (*Func)(this,p+i*ndim);
592 // y[i] = (*Func)(p+i*ndim);
596 // if (v>=2) printf("Contracted:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
599 for (j=0; j<ndim; j++)
602 for (i=1; i<ndim+1; i++)
603 psum[j]+=p[i*ndim+j];
614 /////////////////////////////////////////////////////////////////////////////////////
616 * @brief Do a maximum likelihod fit of the scores with an EV distribution with parameters lamda and mu
619 HitList::MaxLikelihoodEVD(HMM& q, int nbest)
621 double tol=1E-6; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood)
622 static char first_call=1;
623 static Hash<int> size_fam(MAXPROF/10); // Hash counts number of HMMs in family
624 static Hash<int> size_sfam(MAXPROF/10); // Hash counts number of families in superfamily
625 Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit
626 size_fam.Null(0); // Set int value to return when no data can be retrieved
627 size_sfam.Null(0); // Set int value to return when no data can be retrieved
628 excluded.Null(0); // Set int value to return when no data can be retrieved
631 double mu; // EVD[mu,lam](x) = exp(-exp(-(x-mu)/lam)) = P(score<=x)
632 double vertex[2*3]; // three vertices of the simplex in lamda-mu plane
633 double yvertex[3]; // log likelihood values at the three vertices of the simplex
634 int nfunc=0; // number of function calls
635 double sum_weights=0.0;
636 float sum_scores=0.0;
642 // Count how many HMMs are in each family; set number of multiple hits per template nrep
643 if (v>=4) printf(" count number of profiles in each family and families in each superfamily ...\n");
648 if (!size_fam.Contains(hit.fam)) (*size_sfam(hit.sfam))++; //Add one to hash element for superfamily
649 (*size_fam(hit.fam))++; //Add one to hash element for family
650 // printf("size(%s)=%i name=%s\n",hit.fam,*size_fam(hit.fam),hit.name)
652 fams=size_fam.Size();
653 sfams=size_sfam.Size();
655 printf("%-3i HMMs from %i families and %i superfamilies searched. Found %i hits\n",N_searched,fams,sfams,Size());
658 // Query has SCOP family identifier?
659 if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.')
661 char sfamid[NAMELEN];
662 char* ptr_in_fam=q.fam;
663 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
665 char* ptr=strrchr(sfamid,'.');
667 excluded.Add(sfamid);
668 // fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam);
671 // Exclude best superfamilies from fit
674 if (sfams<97+nbest) return;
676 // Find the nbest best-scoring superfamilies for exclusion from first ML fit
677 if (v>=4) printf(" find %i best-scoring superfamilies to exclude from first fit ...\n",nbest);
679 excluded.Add(hit.sfam);
680 // printf("Exclude in first round: %s %8.2f %s\n",hit.name,hit.score_aass,hit.sfam);
681 while (excluded.Size()<nbest)
684 while (!End() && excluded.Contains(ReadNext().sfam)) ;
688 if (ReadNext()<hit && !excluded.Contains(ReadCurrent().sfam))
691 excluded.Add(hit.sfam);
692 // printf("Exclude in first round: %s %8.2f %s %i %i\n",hit.name,hit.score_aass,hit.sfam,excluded.Size(),excluded.Contains(hit.sfam));
694 tol = 0.01/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e
698 // Find the best-scoring superfamilies from first fit for exclusion from second ML fit
699 if (v>=4) printf(" find best-scoring superfamilies to exclude from second fit ...\n");
704 if (hit.Eval < 0.05) excluded.Add(hit.sfam); // changed from 0.5 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
706 tol = 0.001/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e
709 // Put scores into score[] and weights into weight[]
710 if (v>=3) printf(" generate scores and weights array for ML fitting ...\n");
716 if (hit.irep > 1) continue; //Use only best hit per template
717 if (Nprof>=MAXPROF) break;
719 char sfamid[NAMELEN];
720 char* ptr_in_fam=hit.fam;
721 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
723 char* ptr=strrchr(sfamid,'.');
725 if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded
727 if (excluded.Contains(sfamid)) {
728 if (v>=3) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid);
731 // ScopID(hit.cl,hit.fold,hit.sfam,hit.fam); //Get scop superfamily code for template
732 // if (*hit.sfam=='\0' || excluded.Contains(hit.sfam)) continue; // skip HMM
734 score[Nprof] = hit.score;
735 weight[Nprof]=1./size_fam[hit.fam]/size_sfam[hit.sfam];
736 sum_scores +=hit.score*weight[Nprof];
737 sum_weights+=weight[Nprof];
740 // if (v>=4) printf("%-10.10s %-12.12s %-3i %-12.12s %-3i %6.4f %6.4f %7.1f\n",hit.name,hit.fam,size_fam[hit.fam],hit.sfam,size_sfam[hit.sfam],1./size_fam[hit.fam]/size_sfam[hit.sfam],sum,hit.score);
745 printf("%i hits used for score distribution\n",Nprof);
746 // for (int i=0; i<Nprof; i++) printf("%3i score=%8.3f weight=%7.5f\n",i,score[i],weight[i]);
748 // Set simplex vertices and function values
749 mu = sum_scores/sum_weights - 0.584/LAMDA;
750 if (par.loc) // fit only in local mode; in global mode use fixed value LAMDA and mu mean score
752 double (*Func)(void*, double*);
753 Func = HitList::LogLikelihoodEVD_static;
755 if (nbest>0) {vertex[0]=LAMDA; vertex[1]=mu;} /////////////////////////////////////////// DEBUG
756 else {vertex[0]=q.lamda; vertex[1]=mu;}
757 vertex[2]=vertex[0]+0.1; vertex[3]=vertex[1];
758 vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2;
759 yvertex[0]=Func(this,vertex );
760 yvertex[1]=Func(this,vertex+2);
761 yvertex[2]=Func(this,vertex+4);
763 // Find lam and mu that minimize negative log likelihood of data
764 if (v>=3) printf("Fitting to EVD by maximum likelihood...\niter lamda mu -log(P)/N tol\n");
765 rtol = FindMin(2,vertex,yvertex,tol,nfunc,Func);
766 if (v>=3) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0]-(1.5772-log(vertex[0])),rtol);
767 // printf("HHsearch lamda=%-6.3f mu=%-6.3f\n",vertex[0],vertex[1]);
771 vertex[0]=LAMDA_GLOB; vertex[1]=mu;
774 // Set lamda and mu of profile
778 // Set P-values and E-values
779 // CHECK UPDATE FROM score=-logpval to score=-logpval+SSSCORE2NATLOG*score_ss !!!!
785 // Calculate total score in raw score units: P-value = 1- exp(-exp(-lamda*(Saa-mu)))
786 hit.weight=1./size_fam[hit.fam]/size_sfam[hit.sfam]; // needed for transitive scoring
787 hit.logPval = logPvalue(hit.score,vertex);
788 hit.Pval=Pvalue(hit.score,vertex);
789 hit.Eval=exp(hit.logPval+log(N_searched));
790 // hit.score_aass = hit.logPval/0.45-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~4.0 in EVDs for scop20.1.63 HMMs
791 hit.score_aass = -q.lamda*(hit.score-q.mu)/0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0)); // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs
792 hit.Probab = Probab(hit);
793 if (nbest>0 && par.loc) // correct length correction (not needed for second round of fitting, since lamda very similar)
794 if (par.idummy==0) ////////////////////////////////////////////
795 hit.score += log(q.L*hit.L)*(1/LAMDA-1/vertex[0]);
796 hit.score_sort = hit.score_aass;
797 Overwrite(hit); // copy hit object into current position of hitlist
799 if (nbest==0 && par.trans==1) // if in transitive scoring mode (weights file given)
801 else if (nbest==0 && par.trans==2) // if in transitive scoring mode (weights file given)
802 TransitiveScoring2();
803 else if (nbest==0 && par.trans==3) // if in transitive scoring mode (weights file given)
804 TransitiveScoring3();
805 else if (nbest==0 && par.trans==4) // if in transitive scoring mode (weights file given)
806 TransitiveScoring4();
811 /////////////////////////////////////////////////////////////////////////////////////
813 * @brief Calculate correlation and score offset for HHblast composite E-values
816 HitList::CalculateHHblastCorrelation(HMM& q)
818 int nfunc=0; // number of function calls
819 double tol; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood)
820 double vertex[2*3]; // three vertices of the simplex in lamda-mu plane
821 double yvertex[3]; // log likelihood values at the three vertices of the simplex
823 Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit
824 excluded.Null(0); // Set int value to return when no data can be retrieved
826 // Set sum of HHsearch and PSI-BLAST score for calculating correlation
831 hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); // if template not in hash, return log Pval = 0, i.e. Pvalue = 1!
832 Overwrite(hit); // copy hit object into current position of hitlist
835 // Query has SCOP family identifier?
836 if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.')
838 char sfamid[NAMELEN];
839 char* ptr_in_fam=q.fam;
840 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
842 char* ptr=strrchr(sfamid,'.');
844 excluded.Add(sfamid);
845 fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam);
849 // Resort list by sum of log P-values
850 ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues
853 ReadNext(); // skip best hit
857 if (hit.irep>=2) continue; // use only best alignments
858 // if (hit.Eval<0.005) {if (v>=3) printf("Fitting HHblast correlation coefficient: skipping %s with Evalue=%9.1g\n",hit.name,hit.Eval); continue;}
859 if (Nprof>=MAXPROF) break;
861 char sfamid[NAMELEN];
862 char* ptr_in_fam=hit.fam;
863 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
865 char* ptr=strrchr(sfamid,'.');
867 if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded
869 if (excluded.Contains(sfamid)) {
870 if (v>=1) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid);
873 score[Nprof] = -hit.score_sort;
874 weight[Nprof] = 1.0; // = hit.weight;
875 // printf("%3i %-12.12s %7.3f + %7.3f = %7.3f \n",Nprof,hit.name,hit.logPval,blast_logPvals->Show(hit.name),-hit.score_sort); //////////////////////
876 printf("%3i %7.3f %7.3f\n",Nprof,hit.Pval,exp(blast_logPvals->Show(hit.name))); //////////////////////
881 vertex[0]=0.5; vertex[1]=0.2;
882 vertex[2]=vertex[0]+0.2; vertex[3]=vertex[1];
883 vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2;
885 yvertex[0]=RankOrderFitCorr(vertex );
886 yvertex[1]=RankOrderFitCorr(vertex+2);
887 yvertex[2]=RankOrderFitCorr(vertex+4);
888 // yvertex[0]=LogLikelihoodCorr(vertex );
889 // yvertex[1]=LogLikelihoodCorr(vertex+2);
890 // yvertex[2]=LogLikelihoodCorr(vertex+4);
892 v=3;//////////////////////////////////
893 // Find correlation and offset that minimize mean square deviation of reported composite Pvalues from actual
894 if (v>=2) printf("Fitting correlation coefficient for HHblast...\niter corr offset logLikelihood tol\n");
895 float rtol = FindMin(2,vertex,yvertex,tol,nfunc, HitList::RankOrderFitCorr_static);
896 if (v>=2) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0],rtol);
897 if (vertex[0]<0) vertex[0]=0.0;
899 // Print correlation and offset for profile
900 printf("HHblast correlation=%-6.3f score offset=%-6.3f\n",vertex[0],vertex[1]);
901 v=2;//////////////////////////////////
906 * @brief Calculate HHblast composite E-values
909 corr_HHblast(float Nq, float Nt)
915 * @brief Calculate HHblast composite E-values
918 offset_HHblast(float Nq, float Nt)
923 //////////////////////////////////////////////////////////////////////////////
925 * @brief Calculate HHblast composite E-values
928 HitList::CalculateHHblastEvalues(HMM& q)
931 float corr, offset; // correlation coefficient and offset for calculating composite HHblast P-values
937 corr = corr_HHblast(q.Neff_HMM,hit.Neff_HMM);
938 offset = offset_HHblast(q.Neff_HMM,hit.Neff_HMM);
939 hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name);
940 hit.logPval = logPvalue_HHblast(-hit.score_sort+offset,corr); // overwrite logPval from HHsearch with composite logPval from HHblast
941 hit.Pval = Pvalue_HHblast(-hit.score_sort+offset,corr); // overwrite P-value from HHsearch with composite P-value from HHblast
942 hit.Eval = exp(hit.logPval+log(N_searched)); // overwrite E-value from HHsearch with composite E-value from HHblast
943 hit.Probab = Probab(hit);
944 Overwrite(hit); // copy hit object into current position of hitlist
946 ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues
950 //////////////////////////////////////////////////////////////////////////////
952 * @brief Read file generated by blastpgp (default output) and store P-values in hash
955 HitList::ReadBlastFile(HMM& q)
957 char line[LINELEN]=""; // input line
958 int Ndb; // number of sequences in database
959 int Ldb=0; // size of database in number of amino acids
962 if (!blast_logPvals) { blast_logPvals = new(Hash<float>); blast_logPvals->New(16381,0); }
965 if (!strcmp(par.blafile,"stdin")) blaf=stdin;
968 blaf = fopen(par.blafile,"rb");
969 if (!blaf) OpenFileError(par.blafile);
972 // Read number of sequences and size of database
973 while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"sequences;"));
974 if (!strstr(line,"sequences;")) FormatError(par.blafile,"No 'Database:' string found.");
977 if (Ndb==INT_MIN) FormatError(par.blafile,"No integer for number of sequences in database found.");
978 while ((i=strint(ptr))>INT_MIN) Ldb = 1000*Ldb + i;
979 if (Ldb==0) FormatError(par.blafile,"No integer for size of database found.");
980 printf("\nNumber of sequences in database = %i Size of database = %i\n",Ndb,Ldb);
982 // Read all E-values and sequence lengths
983 while (fgetline(line,LINELEN-1,blaf))
987 // Read template name
988 templ = new(char[255]);
991 if (!blast_logPvals->Contains(templ)) // store logPval only for best HSP with template
994 while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"Length ="));
996 int length = strint(ptr);
998 fgetline(line,LINELEN-1,blaf);
999 fgetline(line,LINELEN-1,blaf);
1000 float EvalDB; // E-value[seq-db] = Evalue for comparison Query vs. database, from PSI-BLAST
1001 float EvalQT; // E-value[seq-seq] = Evalue for comparison Query vs. template (seq-seq)
1003 ptr = strstr(line+20,"Expect =");
1004 if (!ptr) FormatError(par.blafile,"No 'Expect =' string found.");
1005 if (sscanf(ptr+8,"%g",&EvalDB)<1)
1008 if (sscanf(ptr+7,"%g",&EvalDB)<1)
1009 FormatError(par.blafile,"No Evalue found after 'Expect ='.");
1011 // Calculate P-value[seq-seq] = 1 - exp(-E-value[seq-seq]) = 1 - exp(-Lt/Ldb*E-value[seq-db])
1012 EvalQT = length/double(Ldb)*double(EvalDB);
1013 if (EvalQT>1E-3) logPval = log(1.0-exp(-EvalQT)); else logPval=log(double(EvalQT)+1.0E-99);
1014 blast_logPvals->Add(templ,logPval);
1015 printf("template=%-10.10s length=%-3i EvalDB=%8.2g EvalQT=%8.2g P-value=%8.2g log Pval=%8.2g\n",templ,length,EvalDB,EvalQT,exp(logPval),logPval);
1018 delete[] templ; templ = NULL;
1026 /////////////////////////////////////////////////////////////////////////////////////
1028 * @brief Calculate output of hidden neural network units
1031 calc_hidden_output(const float* weights, const float* bias, float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
1034 // Calculate activation of hidden unit = sum of all inputs * weights + bias
1035 res = Lqnorm*weights[0] + Ltnorm*weights[1] + Nqnorm*weights[2] + Ntnorm*weights[3] + *bias;
1036 res = 1.0 / (1.0 + exp(-(res ))); // logistic function
1040 ////////////////////////////////////////////////////////////////////////////////////
1042 * @brief Neural network regressions of lamda for EVD
1045 lamda_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
1047 const int inputs = 4;
1048 const int hidden = 4;
1049 const float biases[] = {-0.73195, -1.43792, -1.18839, -3.01141}; // bias for all hidden units
1050 const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit)
1051 -0.52356, -3.37650, 1.12984, -0.46796,
1052 -4.71361, 0.14166, 1.66807, 0.16383,
1053 -0.94895, -1.24358, -1.20293, 0.95434,
1054 -0.00318, 0.53022, -0.04914, -0.77046,
1055 2.45630, 3.02905, 2.53803, 2.64379
1058 for (int h = 0; h<hidden; h++) {
1059 lamda += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h];
1064 ////////////////////////////////////////////////////////////////////////////////////
1066 * @brief Neural network regressions of mu for EVD
1069 mu_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
1071 const int inputs = 4;
1072 const int hidden = 6;
1073 const float biases[] = {-4.25264, -3.63484, -5.86653, -4.78472, -2.76356, -2.21580}; // bias for all hidden units
1074 const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit)
1075 1.96172, 1.07181, -7.41256, 0.26471,
1076 0.84643, 1.46777, -1.04800, -0.51425,
1077 1.42697, 1.99927, 0.64647, 0.27834,
1078 1.34216, 1.64064, 0.35538, -8.08311,
1079 2.30046, 1.31700, -0.46435, -0.46803,
1080 0.90090, -3.53067, 0.59212, 1.47503,
1081 -1.26036, 1.52812, 1.58413, -1.90409, 0.92803, -0.66871
1084 for (int h = 0; h<hidden; h++) {
1085 mu += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h];
1090 //////////////////////////////////////////////////////////////////////////////
1092 * @brief Calculate Pvalues as a function of query and template lengths and diversities
1095 HitList::CalculatePvalues(HMM& q)
1098 float lamda=0.4, mu=3.0;
1099 const float log1000=log(1000.0);
1103 printf("WARNING: idummy should have been ==2 (no length correction)\n");
1107 if(N_searched==0) N_searched=1;
1109 printf("Calculate Pvalues as a function of query and template lengths and diversities...\n");
1117 lamda = lamda_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 );
1118 mu = mu_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 );
1119 // if (v>=3 && nhits++<20)
1120 // printf("hit=%-10.10s Lq=%-4i Lt=%-4i Nq=%5.2f Nt=%5.2f => lamda=%-6.3f mu=%-6.3f\n",hit.name,q.L,hit.L,q.Neff_HMM,hit.Neff_HMM,lamda,mu);
1124 printf("WARNING: global calibration not yet implemented!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
1126 hit.logPval = logPvalue(hit.score,lamda,mu);
1127 hit.Pval = Pvalue(hit.score,lamda,mu);
1128 hit.Eval=exp(hit.logPval+log(N_searched));
1129 // hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs
1130 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1131 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) )/0.45 - fmin(lamda*hit.score_ss,fmax(0.0,0.2*(hit.score-8.0)))/0.45 - 3.0;
1132 hit.score_sort = hit.score_aass;
1133 hit.Probab = Probab(hit);
1141 //////////////////////////////////////////////////////////////////////////////
1143 * @brief Calculate Pvalues from calibration of 0: query HMM, 1:template HMMs, 2: both
1146 HitList::GetPvalsFromCalibration(HMM& q)
1150 if(N_searched==0) N_searched=1;
1156 printf("Using lamda=%-5.3f and mu=%-5.2f from calibrated query HMM %s. \n",q.lamda,q.mu,q.name);
1157 printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n");
1160 printf("Using score distribution parameters lamda and mu from database HMMs \n");
1163 printf("Combining score distribution parameters lamda and mu from query and database HMMs\n");
1164 printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n");
1172 if (par.calm==0 || (hit.logPvalt==0) )
1174 hit.logPval = logPvalue(hit.score,q.lamda,q.mu);
1175 hit.Pval = Pvalue(hit.score,q.lamda,q.mu);
1176 if (par.calm>0 && warn++<1 && v>=1)
1177 printf("Warning: some template HMM (e.g. %s) are not calibrated. Using query calibration.\n",hit.name);
1179 else if (par.calm==1)
1181 hit.logPval = hit.logPvalt;
1182 hit.Pval = hit.Pvalt;
1184 else if (par.calm==2)
1186 hit.logPval = 0.5*( logPvalue(hit.score,q.lamda,q.mu) + hit.logPvalt);
1187 hit.Pval = sqrt( Pvalue(hit.score,q.lamda,q.mu) * hit.Pvalt);
1188 if (v>=5) printf("Score: %7.1f lamda: %7.1f mu: %7.1f P-values: query-calibrated: %8.2G template-calibrated: %8.2G geometric mean: %8.2G\n",hit.score,q.lamda,q.mu,Pvalue(hit.score,q.lamda,q.mu),hit.Pvalt,hit.Pval);
1191 hit.Eval=exp(hit.logPval+log(N_searched));
1192 // hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs
1193 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1194 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) ) / 0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0));
1195 hit.score_sort = hit.score_aass;
1196 hit.Probab = Probab(hit);
1212 //////////////////////////////////////////////////////////////////////////////
1213 //////////////////////////////////////////////////////////////////////////////
1214 //////////////////////////////////////////////////////////////////////////////
1215 // Transitive scoring
1216 //////////////////////////////////////////////////////////////////////////////
1217 //////////////////////////////////////////////////////////////////////////////
1218 //////////////////////////////////////////////////////////////////////////////
1226 /////////////////////////////////////////////////////////////////////////////////////
1228 * @brief Calculate P-values and Probabilities from transitive scoring over whole database
1231 HitList::TransitiveScoring()
1233 void PrintMatrix(float** V, int N);
1234 void PrintMatrix(double** V, int N);
1236 float** Z; // matrix of intra-db Z-scores Z_kl
1237 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
1238 char** fold; // fold name of HMM k
1239 char** fam; // family of HMM k
1240 float* Prob; // probability of HMM k
1241 float* Zq; // Zq[k] = Z-score between query and database HMM k
1242 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
1243 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
1244 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
1245 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
1246 int N; // dimension of weight matrix is NxN
1247 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
1248 int k,l,m,n; // indices for database HMMs
1250 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}
1251 index.Null(-1); // Set int value to return when no data can be retrieved
1252 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit
1253 excluded.Null(0); // Set int value to return when no data can be retrieved
1255 size_t unused; /* disable fread gcc warning */
1257 // Read weights matrix W with index hash and names array
1258 fprintf(stderr,"Reading in weights file\n");
1259 FILE* wfile = fopen(par.wfile,"rb");
1260 if (v>=1 && wfile==NULL)
1262 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
1264 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1268 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)
1269 if (v>=1 && N!=N_searched)
1271 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
1272 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1276 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
1277 // Read names of HMMs (to specify mapping of HMM to matrix indices)
1280 unused = fread(name,sizeof(char),IDLEN,wfile);
1283 // Read symmetric Z-scores matrix
1287 Z[k] = new(float[N]);
1288 for (l=0; l<k; l++) Z[k][l] = Z[l][k];
1289 unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
1291 // Read symmetric covariance matrix
1295 C[k] = new(float[N]);
1296 for (l=0; l<k; l++) C[k][l] = C[l][k];
1297 unused = fread(C[k]+k,sizeof(float),N-k,wfile);
1303 Ztq = new(float[N]);
1304 Zrq = new(float[N]);
1305 fold = new(char*[N]);
1306 fam = new(char*[N]);
1307 Prob = new(float[N]);
1311 // Transform P-values to normally distributed Z-scores and store in Zq vector
1312 fprintf(stderr,"Transform P-values to Z-scores\n");
1313 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
1314 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
1315 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
1321 if (hit.irep>1) continue;
1322 k = index.Show(hit.name);
1323 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
1325 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)
1327 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk
1328 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
1329 // if (isnan(Zq[k])) {
1330 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1331 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
1335 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
1336 fold[k] = new(char[IDLEN]);
1337 fam[k] = new(char[IDLEN]);
1338 strcpy(fold[k],hit.fold);
1339 strcpy(fam[k],hit.fam);
1340 weight[k] = hit.weight;
1341 Prob[k] = hit.Probab;
1347 while (!excluded.End())
1349 excluded.ReadNext(name);
1350 printf("Excluded fold %s from fitting to Ztq\n",name);
1355 ////////////////////////////////////////////////////////////////
1356 // Calculate transitive score (query->l) Zt[l]
1358 // Construct vector ll of indices l for which Z_lq > Zmin_trans
1361 if (Zq[l]>=Zmin_trans) ll[m++]=l;
1362 M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans
1364 // for (m=0; m<M; m++)
1365 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
1368 for (k=0; k<N; k++) Ztq[k]=0.0;
1371 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1372 double** Csub = new(double*[M]);
1373 double** Cinv = new(double*[M]);
1376 Csub[m] = new(double[M]);
1377 Cinv[m] = new(double[M]);
1379 Csub[m][n] = double(C[ll[m]][ll[n]]);
1384 fprintf(stderr,"Covariance matrix\n");
1385 PrintMatrix(Csub,M);
1389 fprintf(stderr,"Calculate inverse of covariance submatrix\n");
1390 InvertMatrix(Cinv,Csub,M);
1394 fprintf(stderr,"Inverse covariance matrix\n");
1395 PrintMatrix(Cinv,M);
1398 // Calculate weights w[l]
1403 sum += 1.0 * Cinv[m][n];
1404 w[m] = fmax(sum,0.0);
1406 for (l=0; l<M; l++){
1407 delete[](Cinv[l]); (Cinv[l]) = NULL;
1409 delete[](Cinv); (Cinv) = NULL;
1411 // Calculate Ztq[k] for all HMMs k
1412 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
1413 float norm = NormalizationFactor(Csub,w,M);
1418 sumZ += w[m] * Z[ll[m]][k];
1422 for (l=0; l<M; l++){
1423 delete[](Csub[l]); (Csub[l]) = NULL;
1425 delete[](Csub); (Csub) = NULL;
1428 ////////////////////////////////////////////////////////////////
1429 // Calculate reverse transitive score (l->query-) Zrq[l]
1431 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
1434 // Construct vector ll of indices l for which Z_lk > Zmin_tran
1437 if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l;
1438 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran
1441 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
1442 // for (m=0; m<M; m++)
1443 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
1451 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1452 double** Csub = new(double*[M]);
1455 Csub[m] = new(double[M]);
1457 Csub[m][n] = double(C[ll[m]][ll[n]]);
1459 // fprintf(stderr,"Covariance matrix\n");
1460 // PrintMatrix(Csub,M);
1464 for (m=0; m<M; m++) w[m] = 1.0/M;
1469 double** Cinv = new(double*[M]);
1470 for (m=0; m<M; m++) Cinv[m] = new(double[M]);
1473 InvertMatrix(Cinv,Csub,M);
1475 // fprintf(stderr,"Inverse covariance matrix\n");
1476 // PrintMatrix(Cinv,M);
1478 // Calculate weights w[l]
1483 sum += 1.0 * Cinv[m][n];
1484 w[m] = fmax(sum,0.0);
1487 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
1489 for (l=0; l<M; l++){
1490 delete[](Cinv[l]); (Cinv[l]) = NULL;
1492 delete[](Cinv); (Cinv) = NULL;
1495 // Calculate Zrq[k] and normalize
1496 float norm = NormalizationFactor(Csub,w,M);
1499 sumZ += w[m] * Zq[ll[m]];
1502 for (l=0; l<M; l++){
1503 delete[](Csub[l]); (Csub[l]) = NULL;
1505 delete[](Csub); (Csub) = NULL;
1508 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
1511 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
1514 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
1515 // if (isnan(Zqtot))
1517 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1518 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1522 if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) {
1523 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1528 // Calculate mean and standard deviation of Z1q
1529 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
1535 if (excluded.Contains(fold[k])) continue;
1537 sumZ += weight[k]*Ztq[k];
1538 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
1541 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1542 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
1547 float mu = sumZ/sumw;
1548 float sigma = sqrt(sumZ2/sumw-mu*mu);
1549 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);
1550 sigma *= 1.01;// correct different fitting of EVD and normal variables
1552 // Normalize Ztq and calculate P1-values
1553 fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
1558 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
1559 hit.E1val = N_searched*(hit.logPval<-100.0? 0.0 : exp(hit.logPval));
1560 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1561 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
1562 hit.Probab = Probab(hit);
1563 hit.score_sort = hit.logPval;
1564 Overwrite(hit); // copy hit object into current position of hitlist
1567 for (k=0; k<N; k++){
1568 delete[](Z[k]); (Z[k]) = NULL;
1570 for (k=0; k<N; k++){
1571 delete[](C[k]); (C[k]) = NULL;
1573 for (k=0; k<N; k++){
1574 delete[](fold[k]); (fold[k]) = NULL;
1576 for (k=0; k<N; k++){
1577 delete[](fam[k]); (fam[k]) = NULL;
1579 delete[](C); (C) = NULL;
1580 delete[](Z); (Z) = NULL;
1581 delete[](fold); (fold) = NULL;
1582 delete[](fam); (fam) = NULL;
1583 delete[](Prob); (Prob) = NULL;
1584 delete[](ll); (ll) = NULL;
1585 delete[](Zq); (Zq) = NULL;
1586 delete[](Ztq); (Ztq) = NULL;
1591 //////////////////////////////////////////////////////////////////////////////
1593 * @brief Calculate P-values and Probabilities from transitive scoring over whole database
1596 HitList::TransitiveScoring2()
1598 void PrintMatrix(float** V, int N);
1599 void PrintMatrix(double** V, int N);
1601 float** Z; // matrix of intra-db Z-scores Z_kl
1602 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
1603 char** fold; // fold name of HMM k
1604 char** fam; // family of HMM k
1605 float* Prob; // probability of HMM k
1606 float* Zq; // Zq[k] = Z-score between query and database HMM k
1607 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
1608 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
1609 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
1610 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
1611 int N; // dimension of weight matrix is NxN
1612 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
1613 int k,l,m,n; // indices for database HMMs
1615 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}
1616 index.Null(-1); // Set int value to return when no data can be retrieved
1617 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit
1618 excluded.Null(0); // Set int value to return when no data can be retrieved
1620 size_t unused; /* disable fread gcc warning */
1622 // Read weights matrix W with index hash and names array
1623 fprintf(stderr,"Reading in weights file\n");
1624 FILE* wfile = fopen(par.wfile,"rb");
1625 if (v>=1 && wfile==NULL)
1627 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
1629 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1633 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)
1634 if (v>=1 && N!=N_searched)
1636 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
1637 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1641 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
1642 // Read names of HMMs (to specify mapping of HMM to matrix indices)
1645 unused = fread(name,sizeof(char),IDLEN,wfile);
1648 // Read symmetric Z-scores matrix
1652 Z[k] = new(float[N]);
1653 for (l=0; l<k; l++) Z[k][l] = Z[l][k];
1654 unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
1656 // Read symmetric covariance matrix
1660 C[k] = new(float[N]);
1661 for (l=0; l<k; l++) C[k][l] = C[l][k];
1662 unused = fread(C[k]+k,sizeof(float),N-k,wfile);
1668 Ztq = new(float[N]);
1669 Zrq = new(float[N]);
1670 fold = new(char*[N]);
1671 fam = new(char*[N]);
1672 Prob = new(float[N]);
1676 // Transform P-values to normally distributed Z-scores and store in Zq vector
1677 fprintf(stderr,"Transform P-values to Z-scores\n");
1678 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
1679 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
1680 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
1686 if (hit.irep>1) continue;
1687 k = index.Show(hit.name);
1688 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
1690 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)
1692 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk
1693 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
1694 // if (isnan(Zq[k]))
1696 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1697 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
1701 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
1702 fold[k] = new(char[IDLEN]);
1703 fam[k] = new(char[IDLEN]);
1704 strcpy(fold[k],hit.fold);
1705 strcpy(fam[k],hit.fam);
1706 weight[k] = hit.weight;
1707 Prob[k] = hit.Probab;
1713 while (!excluded.End())
1715 excluded.ReadNext(name);
1716 printf("Excluded fold %s from fitting to Ztq\n",name);
1721 ////////////////////////////////////////////////////////////////
1722 // Calculate transitive score (query->l) Zt[l]
1724 // Construct vector ll of indices l for which Z_lq > Zmin_trans
1727 if (Zq[l]>=Zmin_trans) ll[m++]=l;
1728 M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans
1730 // for (m=0; m<M; m++)
1731 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
1734 for (k=0; k<N; k++) Ztq[k]=0.0;
1737 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1738 double** Csub = new(double*[M]);
1739 double** Cinv = new(double*[M]);
1742 Csub[m] = new(double[M]);
1743 Cinv[m] = new(double[M]);
1745 Csub[m][n] = double(C[ll[m]][ll[n]]);
1750 fprintf(stderr,"Covariance matrix\n");
1751 PrintMatrix(Csub,M);
1755 // fprintf(stderr,"Calculate inverse of covariance submatrix\n");
1756 // InvertMatrix(Cinv,Csub,M);
1760 // fprintf(stderr,"Inverse covariance matrix\n");
1761 // PrintMatrix(Cinv,M);
1765 // Calculate weights w[l]
1770 sum += 1.0 * Csub[m][n];
1771 printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum);
1772 w[m] = (sum>0? Zq[ll[m]] / sum : 0.0);
1774 for (l=0; l<M; l++){
1775 delete[](Cinv[l]); (Cinv[l]) = NULL;
1777 delete[](Cinv); (Cinv) = NULL;
1779 // Calculate Ztq[k] for all HMMs k
1780 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
1781 float norm = NormalizationFactor(Csub,w,M);
1786 sumZ += w[m] * Z[ll[m]][k];
1790 for (l=0; l<M; l++){
1791 delete[](Csub[l]); (Csub[l]) = NULL;
1793 delete[](Csub); (Csub) = NULL;
1796 ////////////////////////////////////////////////////////////////
1797 // Calculate reverse transitive score (l->query-) Zrq[l]
1799 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
1802 // Construct vector ll of indices l for which Z_lk > Zmin_tran
1805 if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l;
1806 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran
1809 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
1810 // for (m=0; m<M; m++)
1811 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
1819 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1820 double** Csub = new(double*[M]);
1823 Csub[m] = new(double[M]);
1825 Csub[m][n] = double(C[ll[m]][ll[n]]);
1827 // fprintf(stderr,"Covariance matrix\n");
1828 // PrintMatrix(Csub,M);
1832 for (m=0; m<M; m++) w[m] = 1.0/M;
1837 double** Cinv = new(double*[M]);
1838 for (m=0; m<M; m++) Cinv[m] = new(double[M]);
1841 // InvertMatrix(Cinv,Csub,M);
1843 // // fprintf(stderr,"Inverse covariance matrix\n");
1844 // // PrintMatrix(Cinv,M);
1846 // Calculate weights w[l]
1851 sum += 1.0 * Csub[m][n];
1852 w[m] = (sum>0? Z[ll[m]][k] / sum : 0.0);
1855 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
1857 for (l=0; l<M; l++){
1858 delete[](Cinv[l]); (Cinv[l]) = NULL;
1860 delete[](Cinv); (Cinv) = NULL;
1863 // Calculate Zrq[k] and normalize
1864 float norm = NormalizationFactor(Csub,w,M);
1867 sumZ += w[m] * Zq[ll[m]];
1870 for (l=0; l<M; l++){
1871 delete[](Csub[l]); (Csub[l]) = NULL;
1873 delete[](Csub); (Csub) = NULL;
1876 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
1879 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
1882 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
1883 // if (isnan(Zqtot))
1885 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1886 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1890 if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) {
1891 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1896 // Calculate mean and standard deviation of Z1q
1897 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
1903 if (excluded.Contains(fold[k])) continue;
1905 sumZ += weight[k]*Ztq[k];
1906 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
1909 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1910 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
1915 float mu = sumZ/sumw;
1916 float sigma = sqrt(sumZ2/sumw-mu*mu);
1917 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);
1918 sigma *= 1.01;// correct different fitting of EVD and normal variables
1920 // Normalize Ztq and calculate P1-values
1921 fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
1926 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
1927 hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));
1928 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1929 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
1930 hit.Probab = Probab(hit);
1931 hit.score_sort = hit.logPval;
1932 Overwrite(hit); // copy hit object into current position of hitlist
1935 for (k=0; k<N; k++){
1936 delete[](Z[k]); (Z[k]) = NULL;
1938 for (k=0; k<N; k++){
1939 delete[](C[k]); (C[k]) = NULL;
1941 for (k=0; k<N; k++){
1942 delete[](fold[k]); (fold[k]) = NULL;
1944 for (k=0; k<N; k++){
1945 delete[](fam[k]); (fam[k]) = NULL;
1947 delete[](C); (C) = NULL;
1948 delete[](Z); (Z) = NULL;
1949 delete[](fold); (fold) = NULL;
1950 delete[](fam); (fam) = NULL;
1951 delete[](Prob); (Prob) = NULL;
1952 delete[](ll); (ll) = NULL;
1953 delete[](Zq); (Zq) = NULL;
1954 delete[](Ztq); (Ztq) = NULL;
1958 /////////////////////////////////////////////////////////////////////////////////////
1960 * @brief Calculate P-values and Probabilities from transitive scoring over whole database
1961 * Like TransitiveScoring(),
1962 * but in transitive scoring, Z1_qk = sum_l w_l*Z_lk, use all l:E_ql<=E_qk
1963 * and in reverse scoring, Z1_kr = sum_l w_l*Z_lq, use all l:E_kl<=E_kq
1966 HitList::TransitiveScoring3()
1968 void PrintMatrix(float** V, int N);
1969 void PrintMatrix(double** V, int N);
1971 float** Z; // matrix of intra-db Z-scores Z_kl
1972 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
1973 char** fold; // fold name of HMM k
1974 char** fam; // family of HMM k
1975 float* Prob; // probability of HMM k
1976 float* Zq; // Zq[k] = Z-score between query and database HMM k
1977 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
1978 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
1979 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
1980 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
1981 int N; // dimension of weight matrix is NxN
1982 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
1983 int k,l,m,n; // indices for database HMMs
1985 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}
1986 index.Null(-1); // Set int value to return when no data can be retrieved
1987 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit
1988 excluded.Null(0); // Set int value to return when no data can be retrieved
1990 size_t unused; /* disable fread gcc warning */
1992 // Read weights matrix W with index hash and names array
1993 fprintf(stderr,"Reading in weights file\n");
1994 FILE* wfile = fopen(par.wfile,"rb");
1995 if (v>=1 && wfile==NULL)
1997 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
1999 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2003 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)
2004 if (v>=1 && N!=N_searched)
2006 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
2007 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2011 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
2012 // Read names of HMMs (to specify mapping of HMM to matrix indices)
2015 unused = fread(name,sizeof(char),IDLEN,wfile);
2018 // Read symmetric Z-scores matrix
2022 Z[k] = new(float[N]);
2023 for (l=0; l<k; l++) Z[k][l] = Z[l][k];
2024 unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
2026 // Read symmetric covariance matrix
2030 C[k] = new(float[N]);
2031 for (l=0; l<k; l++) C[k][l] = C[l][k];
2032 unused = fread(C[k]+k,sizeof(float),N-k,wfile);
2038 Ztq = new(float[N]);
2039 Zrq = new(float[N]);
2040 fold = new(char*[N]);
2041 fam = new(char*[N]);
2042 Prob = new(float[N]);
2046 // Transform P-values to normally distributed Z-scores and store in Zq vector
2047 fprintf(stderr,"Transform P-values to Z-scores\n");
2048 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
2049 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
2050 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
2056 if (hit.irep>1) continue;
2057 k = index.Show(hit.name);
2058 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
2060 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)
2062 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk
2063 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
2064 // if (isnan(Zq[k]))
2066 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2067 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
2071 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
2072 fold[k] = new(char[IDLEN]);
2073 fam[k] = new(char[IDLEN]);
2074 strcpy(fold[k],hit.fold);
2075 strcpy(fam[k],hit.fam);
2076 weight[k] = hit.weight;
2077 Prob[k] = hit.Probab;
2083 while (!excluded.End())
2085 excluded.ReadNext(name);
2086 printf("Excluded fold %s from fitting to Ztq\n",name);
2091 ////////////////////////////////////////////////////////////////
2092 // Calculate transitive score (query->l) Ztq[l]
2094 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
2097 // Construct vector ll of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans)
2098 float Zmink = fmax(Zq[k],Zmin_trans);
2099 for (m=l=0; l<N; l++)
2100 if (Zq[l]>=Zmink) ll[m++]=l;
2101 M = m; // number of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans)
2103 // for (m=0; m<M; m++)
2104 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
2112 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2113 double** Csub = new(double*[M]);
2114 double** Cinv = new(double*[M]);
2117 Csub[m] = new(double[M]);
2118 Cinv[m] = new(double[M]);
2120 Csub[m][n] = double(C[ll[m]][ll[n]]);
2123 // fprintf(stderr,"Covariance matrix\n");
2124 // PrintMatrix(Csub,M);
2127 // fprintf(stderr,"Calculate inverse of covariance submatrix\n");
2128 InvertMatrix(Cinv,Csub,M);
2130 // fprintf(stderr,"Inverse covariance matrix\n");
2131 // PrintMatrix(Cinv,M);
2133 // Calculate weights w[l]
2138 sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq !
2139 w[m] = fmax(sum,0.0);
2141 for (l=0; l<M; l++){
2142 delete[](Cinv[l]); (Cinv[l]) = NULL;
2144 delete[](Cinv); (Cinv) = NULL;
2147 float norm = NormalizationFactor(Csub,w,M);
2150 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2151 // sumZ += w[m] * Z[ll[m]][k];
2154 for (l=0; l<M; l++){
2155 delete[](Csub[l]); (Csub[l]) = NULL;
2157 delete[](Csub); (Csub) = NULL;
2161 ////////////////////////////////////////////////////////////////
2162 // Calculate reverse transitive score (l->query-) Zrq[l]
2164 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
2167 // Construct vector ll of indices l for which Z_lk > Zmin_tran
2168 float Zmink = fmax(Zq[k],Zmin_trans);
2169 for (m=l=0; l<N; l++)
2170 if (Z[l][k]>=Zmink) ll[m++]=l;
2171 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran
2174 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
2175 // for (m=0; m<M; m++)
2176 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
2184 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2185 double** Csub = new(double*[M]);
2188 Csub[m] = new(double[M]);
2190 Csub[m][n] = double(C[ll[m]][ll[n]]);
2192 // fprintf(stderr,"Covariance matrix\n");
2193 // PrintMatrix(Csub,M);
2197 for (m=0; m<M; m++) w[m] = 1.0/M;
2202 double** Cinv = new(double*[M]);
2203 for (m=0; m<M; m++) Cinv[m] = new(double[M]);
2206 InvertMatrix(Cinv,Csub,M);
2208 // fprintf(stderr,"Inverse covariance matrix\n");
2209 // PrintMatrix(Cinv,M);
2211 // Calculate weights w[l]
2216 sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq !
2217 w[m] = fmax(sum,0.0);
2219 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
2220 for (l=0; l<M; l++){
2221 delete[](Cinv[l]); (Cinv[l]) = NULL;
2223 delete[](Cinv); (Cinv) = NULL;
2226 // Calculate Zrq[k] and normalize
2227 float norm = NormalizationFactor(Csub,w,M);
2230 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2231 // sumZ += w[m] * Zq[ll[m]];
2234 for (l=0; l<M; l++){
2235 delete[](Csub[l]); (Csub[l]) = NULL;
2237 delete[](Csub); (Csub) = NULL;
2240 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
2243 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
2247 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
2248 // if (isnan(Zqtot))
2250 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2251 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2255 if (v>=3 && Zqtot > 2*Zmin_trans) {
2256 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2261 // Calculate mean and standard deviation of Z1q
2262 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
2268 if (excluded.Contains(fold[k])) continue;
2270 sumZ += weight[k]*Ztq[k];
2271 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
2274 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2275 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
2280 float mu = sumZ/sumw;
2281 float sigma = sqrt(sumZ2/sumw-mu*mu);
2282 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);
2283 sigma *= 1.01;// correct different fitting of EVD and normal variables
2285 // Normalize Ztq and calculate P1-values
2286 fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
2291 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
2292 hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));
2293 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
2294 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
2295 hit.Probab = Probab(hit);
2296 hit.score_sort = hit.logPval;
2297 Overwrite(hit); // copy hit object into current position of hitlist
2300 for (k=0; k<N; k++){
2301 delete[](Z[k]); (Z[k]) = NULL;
2303 for (k=0; k<N; k++){
2304 delete[](C[k]); (C[k]) = NULL;
2306 for (k=0; k<N; k++){
2307 delete[](fold[k]); (fold[k]) = NULL;
2309 for (k=0; k<N; k++){
2310 delete[](fam[k]); (fam[k]) = NULL;
2312 delete[](C); (C) = NULL;
2313 delete[](Z); (Z) = NULL;
2314 delete[](fold); (fold) = NULL;
2315 delete[](fam); (fam) = NULL;
2316 delete[](Prob); (Prob) = NULL;
2317 delete[](ll); (ll) = NULL;
2318 delete[](Zq); (Zq) = NULL;
2319 delete[](Ztq); (Ztq) = NULL;
2324 /////////////////////////////////////////////////////////////////////////////////////
2326 * @brief Calculate P-values and Probabilities from transitive scoring over whole database
2327 * Best tested scheme. Use fmin(Zq[ll[m]],Z[ll[m]][k])
2328 * and fast approximation for weights (not inverse covariance matrix)
2331 HitList::TransitiveScoring4()
2333 void PrintMatrix(float** V, int N);
2334 void PrintMatrix(double** V, int N);
2336 float** Z; // matrix of intra-db Z-scores Z_kl
2337 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
2338 char** fold; // fold name of HMM k
2339 char** fam; // family of HMM k
2340 float* Prob; // probability of HMM k
2341 float* Zq; // Zq[k] = Z-score between query and database HMM k
2342 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
2343 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
2344 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
2345 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
2346 int N; // dimension of weight matrix is NxN
2347 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
2348 int k,l,m,n; // indices for database HMMs
2350 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}
2351 index.Null(-1); // Set int value to return when no data can be retrieved
2352 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit
2353 excluded.Null(0); // Set int value to return when no data can be retrieved
2355 size_t unused; /* disable fread gcc warning */
2357 // Read weights matrix W with index hash and names array
2358 fprintf(stderr,"Reading in weights file\n");
2359 FILE* wfile = fopen(par.wfile,"rb");
2360 if (v>=1 && wfile==NULL)
2362 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
2364 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2368 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)
2369 if (v>=1 && N!=N_searched)
2371 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
2372 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2376 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
2377 // Read names of HMMs (to specify mapping of HMM to matrix indices)
2380 unused = fread(name,sizeof(char),IDLEN,wfile);
2383 // Read symmetric Z-scores matrix
2387 Z[k] = new(float[N]);
2388 for (l=0; l<k; l++) Z[k][l] = Z[l][k];
2389 unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
2391 // Read symmetric covariance matrix
2395 C[k] = new(float[N]);
2396 for (l=0; l<k; l++) C[k][l] = C[l][k];
2397 unused = fread(C[k]+k,sizeof(float),N-k,wfile);
2403 Ztq = new(float[N]);
2404 Zrq = new(float[N]);
2405 fold = new(char*[N]);
2406 fam = new(char*[N]);
2407 Prob = new(float[N]);
2411 // Transform P-values to normally distributed Z-scores and store in Zq vector
2412 fprintf(stderr,"Transform P-values to Z-scores\n");
2413 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
2414 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
2415 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
2421 if (hit.irep>1) continue;
2422 k = index.Show(hit.name);
2423 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
2425 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)
2427 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk
2428 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
2429 // if (isnan(Zq[k])) {
2430 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2431 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
2435 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
2436 fold[k] = new(char[IDLEN]);
2437 fam[k] = new(char[IDLEN]);
2438 strcpy(fold[k],hit.fold);
2439 strcpy(fam[k],hit.fam);
2440 weight[k] = hit.weight;
2441 Prob[k] = hit.Probab;
2447 while (!excluded.End())
2449 excluded.ReadNext(name);
2450 printf("Excluded fold %s from fitting to Ztq\n",name);
2454 ////////////////////////////////////////////////////////////////
2455 // Calculate transitive score (query->l) Zt[l]
2457 // Construct vector ll of indices l for which Z_lq > Zmin_trans
2460 if (Zq[l]>=Zmin_trans) ll[m++]=l;
2461 M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans
2463 // for (m=0; m<M; m++)
2464 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
2467 for (k=0; k<N; k++) Ztq[k]=0.0;
2470 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2471 double** Csub = new(double*[M]);
2474 Csub[m] = new(double[M]);
2476 Csub[m][n] = double(C[ll[m]][ll[n]]);
2481 fprintf(stderr,"Covariance matrix\n");
2482 PrintMatrix(Csub,M);
2486 // Calculate weights w[l]
2491 sum += fmax(0.0,Csub[m][n]);
2492 printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum);
2496 // Calculate Ztq[k] for all HMMs k
2497 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
2498 float norm = NormalizationFactor(Csub,w,M);
2503 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2507 for (l=0; l<M; l++){
2508 delete[](Csub[l]); (Csub[l]) = NULL;
2510 delete[](Csub); (Csub) = NULL;
2513 ////////////////////////////////////////////////////////////////
2514 // Calculate reverse transitive score (l->query-) Zrq[l]
2516 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
2519 // Construct vector ll of indices l for which Z_lk > Zmin_tran
2522 if (Z[k][l]>=Zmin_trans) ll[m++]=l;
2523 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran
2526 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
2527 // for (m=0; m<M; m++)
2528 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
2536 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2537 double** Csub = new(double*[M]);
2540 Csub[m] = new(double[M]);
2542 Csub[m][n] = double(C[ll[m]][ll[n]]);
2544 // fprintf(stderr,"Covariance matrix\n");
2545 // PrintMatrix(Csub,M);
2547 // Calculate weights w[l]
2552 sum += fmax(0.0,Csub[m][n]);
2556 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
2559 // Calculate Zrq[k] and normalize
2560 float norm = NormalizationFactor(Csub,w,M);
2563 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2566 for (l=0; l<M; l++){
2567 delete[](Csub[l]); (Csub[l]) = NULL;
2569 delete[](Csub); (Csub) = NULL;
2572 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
2575 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
2578 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
2579 // if (isnan(Zqtot))
2581 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2582 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2586 if (v>=3 && Zq[k] + Zqtot > 2*Zmin_trans) {
2587 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2592 // Calculate mean and standard deviation of Z1q
2593 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
2599 if (excluded.Contains(fold[k])) continue;
2601 sumZ += weight[k]*Ztq[k];
2602 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
2605 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2606 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
2611 float mu = sumZ/sumw;
2612 float sigma = sqrt(sumZ2/sumw-mu*mu);
2613 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);
2614 sigma *= 1.01;// correct different fitting of EVD and normal variables
2616 // Normalize Ztq and calculate P1-values
2617 fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
2622 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
2623 hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));
2624 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
2625 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
2626 hit.Probab = Probab(hit);
2627 hit.score_sort = hit.logPval;
2628 Overwrite(hit); // copy hit object into current position of hitlist
2631 for (k=0; k<N; k++){
2632 delete[](Z[k]); (Z[k]) = NULL;
2634 for (k=0; k<N; k++){
2635 delete[](C[k]); (C[k]) = NULL;
2637 for (k=0; k<N; k++){
2638 delete[](fold[k]); (fold[k]) = NULL;
2640 for (k=0; k<N; k++){
2641 delete[](fam[k]); (fam[k]) = NULL;
2643 delete[](C); (C) = NULL;
2644 delete[](Z); (Z) = NULL;
2645 delete[](fold); (fold) = NULL;
2646 delete[](fam); (fam) = NULL;
2647 delete[](Prob); (Prob) = NULL;
2648 delete[](ll); (ll) = NULL;
2649 delete[](Zq); (Zq) = NULL;
2650 delete[](Ztq); (Ztq) = NULL;
2654 /////////////////////////////////////////////////////////////////////////////////////
2656 * @brief Score2Z transforms the -log(P-value) score into a Z-score for 0 < S
2657 * Score2Z(S) = sqrt(2)*dierfc(2*e^(-S)), where dierfc is the inverse of the complementary error function
2660 HitList::Score2Z(double S)
2662 double s, t, u, w, x, y, z;
2663 if (S<=0) return double(-100000);
2664 y = ( S>200 ? 0.0 : 2.0*exp(-S) );
2667 z = (S<1e-6? 2*S : 2-y);
2668 w = 0.916461398268964 - log(z);
2673 w = 0.916461398268964 - (0.69314718056-S);
2677 s = (log(u) + 0.488826640273108) / w;
2678 t = 1 / (u + 0.231729200323405);
2680 x = u * (1 - s * (s * 0.124610454613712 + 0.5)) -
2681 ((((-0.0728846765585675 * t + 0.269999308670029) * t +
2682 0.150689047360223) * t + 0.116065025341614) * t +
2683 0.499999303439796) * t;
2684 t = 3.97886080735226 / (x + 3.97886080735226);
2686 s = (((((((((0.00112648096188977922 * u +
2687 1.05739299623423047e-4) * u - 0.00351287146129100025) * u -
2688 7.71708358954120939e-4) * u + 0.00685649426074558612) * u +
2689 0.00339721910367775861) * u - 0.011274916933250487) * u -
2690 0.0118598117047771104) * u + 0.0142961988697898018) * u +
2691 0.0346494207789099922) * u + 0.00220995927012179067;
2692 s = ((((((((((((s * u - 0.0743424357241784861) * u -
2693 0.105872177941595488) * u + 0.0147297938331485121) * u +
2694 0.316847638520135944) * u + 0.713657635868730364) * u +
2695 1.05375024970847138) * u + 1.21448730779995237) * u +
2696 1.16374581931560831) * u + 0.956464974744799006) * u +
2697 0.686265948274097816) * u + 0.434397492331430115) * u +
2698 0.244044510593190935) * t -
2699 (z==0? 0: z * exp(x * x - 0.120782237635245222));
2700 x += s * (x * s + 1);
2704 return double (1.41421356237*x);
2707 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2709 * @brief Z2Score transforms the Z-score into a -log(P-value) value
2710 * Z2Score(Z) = log(2) - log( erfc(Z/sqrt(2)) ) , where derfc is the complementary error function
2713 HitList::Z2Score(double Z)
2716 x = 0.707106781188*Z;
2717 if (x>10) return 0.69314718056 - (-x*x - log( (1-0.5/x/x)/x/1.772453851) );
2718 t = 3.97886080735226 / (fabs(x) + 3.97886080735226);
2720 y = (((((((((0.00127109764952614092 * u + 1.19314022838340944e-4) * u -
2721 0.003963850973605135) * u - 8.70779635317295828e-4) * u +
2722 0.00773672528313526668) * u + 0.00383335126264887303) * u -
2723 0.0127223813782122755) * u - 0.0133823644533460069) * u +
2724 0.0161315329733252248) * u + 0.0390976845588484035) * u +
2725 0.00249367200053503304;
2726 y = ((((((((((((y * u - 0.0838864557023001992) * u -
2727 0.119463959964325415) * u + 0.0166207924969367356) * u +
2728 0.357524274449531043) * u + 0.805276408752910567) * u +
2729 1.18902982909273333) * u + 1.37040217682338167) * u +
2730 1.31314653831023098) * u + 1.07925515155856677) * u +
2731 0.774368199119538609) * u + 0.490165080585318424) * u +
2732 0.275374741597376782) * t * (x>10? 0.0 : exp(-x * x));
2733 return 0.69314718056 - log( x < 0 ? 2 - y : y );
2737 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2742 PrintMatrix(float** V, int N)
2747 fprintf(stderr,"k=%4i \n",k);
2750 fprintf(stderr,"%4i:%6.3f ",l,V[k][l]);
2751 if ((l+1)%10==0) fprintf(stderr,"\n");
2753 fprintf(stderr,"\n");
2755 fprintf(stderr,"\n");
2758 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2763 PrintMatrix(double** V, int N)
2768 fprintf(stderr,"k=%4i \n",k);
2771 fprintf(stderr,"%4i:%6.3f ",l,V[k][l]);
2772 if ((l+1)%10==0) fprintf(stderr,"\n");
2774 fprintf(stderr,"\n");
2776 fprintf(stderr,"\n");
2779 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2784 HitList::Normalize(float* Ztq, char** fold, Hash<int>& excluded)
2789 for (int k=0; k<N_searched; k++)
2791 if (excluded.Contains(fold[k])) continue;
2793 sumZ += weight[k]*Ztq[k];
2794 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
2796 float mu = sumZ/sumw;
2797 float sigma = sqrt(sumZ2/sumw-mu*mu);
2798 printf("Transitive score Ztq: mu=%8.3g sigma=%8.3g\n",mu,sigma);
2799 for (int k=0; k<N_searched; k++) Ztq[k] = (Ztq[k]-mu)/sigma;
2803 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2805 * @brief Calculate standard deviation of Z1 = sum_m [ w_m * Z_m ], where Csub_mn = cov(Z_m,Z_n)
2808 HitList::NormalizationFactor(double** Csub, float* w, int M)
2811 for (int m=0; m<M; m++)
2814 for (int n=0; n<M; n++) summ += Csub[m][n]*w[n];
2820 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2822 * @brief Calculate inverse of matrix A and store result in B
2825 HitList::InvertMatrix(double** B, double** A, int N)
2829 printf("Error: InvertMatrix called with matrix of dimension 0\n");
2834 B[0][0] = (A[0][0]==0.0? 0 :1.0/A[0][0]);
2839 double** V = new(double*[N]);
2840 double* s = new(double[N]);
2841 for (k=0; k<N; k++) V[k] = new(double[N]);
2843 // Copy original matrix A into B since B will be overwritten by SVD()
2848 SVD(B, N, s, V); // U replaces B on output; s[] contains singluar values
2850 // Calculate inverse of A: A^-1 = V * diag(1/s) * U^t
2852 // Calculate V[k][m] -> V[k][m] *diag(1/s)
2855 if (s[m]!=0.0) V[k][m] /= s[m]; else V[k][m] = 0.0;
2856 // Calculate V[k][l] -> (V * U^t)_kl
2859 if (v>=4 && k%100==0) printf("%i\n",k);
2862 s[l] = 0.0; // use s[] as temporary memory to avoid overwriting B[k][] as long as it is needed
2864 s[l] += V[k][m]*U[l][m];
2866 for (l=0; l<N; l++) V[k][l]=s[l];
2872 for (k=0; k<N; k++){
2873 delete[](V[k]); (V[k]) = NULL;
2875 delete[](V); (V) = NULL;
2880 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2885 HitList::TransposeMatrix(double** V, int N)
2888 for (k=0; k<N; k++) // transpose Z for efficiency of ensuing matrix multiplication
2891 double buf = V[k][l];
2897 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2898 static double sqrarg;
2899 #define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
2900 static double maxarg1,maxarg2;
2901 #define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))
2902 static int iminarg1,iminarg2;
2903 #define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))
2904 #define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
2907 * @brief This is a version of the Golub and Reinsch algorithm for singular value decomposition for a quadratic
2908 * (n x n) matrix A. It is sped up by transposing A amd V matrices at various places in the algorithm.
2909 * On a 400x400 matrix it runs in 1.6 s or 2.3 times faster than the original (n x m) version.
2910 * On a 4993x4993 matrix it runs in 2h03 or 4.5 times faster than the original (n x m) version.
2912 * Given a matrix a[0..n-1][0..n-1], this routine computes its singular value decomposition, A = U · W · V^t .
2913 * The matrix U replaces a on output. The diagonal matrix of singular values W is out-put as a vector w[0..n-1].
2914 * The matrix V (not the transpose V^t) is output as V[0..n-1][0..n-1] ./
2917 HitList::SVD(double **A, int n, double w[], double **V)
2919 int m=n; // in general algorithm A is an (m x n) matrix instead of (n x n)
2921 double pythag(double a, double b);
2922 int flag,i,its,j,jj,k,l=1,nm=1;
2923 double anorm,c,f,g,h,s,scale,x,y,z,*rv1;
2927 // Householder reduction to bidiagonal form.
2928 if (v>=5) printf("\nHouseholder reduction to bidiagonal form\n");
2930 if (v>=4 && i%100==0) printf("i=%i\n",i);
2931 if (v>=4) fprintf(stderr,".");
2936 for (k=i;k<m;k++) scale += fabs(A[k][i]);
2940 s += A[k][i]*A[k][i];
2943 g = -SIGN(sqrt(s),f);
2947 for (s=0.0,k=i;k<m;k++) s += A[k][i]*A[k][j];
2949 for (k=i;k<m;k++) A[k][j] += f*A[k][i];
2951 for (k=i;k<m;k++) A[k][i] *= scale;
2956 if (i < m && i != n-1) {
2957 for (k=l;k<n;k++) scale += fabs(A[i][k]);
2961 s += A[i][k]*A[i][k];
2964 g = -SIGN(sqrt(s),f);
2967 for (k=l;k<n;k++) rv1[k]=A[i][k]/h;
2969 for (s=0.0,k=l;k<n;k++) s += A[j][k]*A[i][k];
2970 for (k=l;k<n;k++) A[j][k] += s*rv1[k];
2972 for (k=l;k<n;k++) A[i][k] *= scale;
2975 anorm=FMAX(anorm,(fabs(w[i])+fabs(rv1[i])));
2977 // Accumulation of right-hand transformations.
2978 if (v>=5) printf("\nAccumulation of right-hand transformations\n");
2979 TransposeMatrix(V,n);
2980 for (i=n-1;i>=0;i--) {
2981 if (v>=4 && i%100==0) printf("i=%i\n",i);
2982 if (v>=4) fprintf(stderr,".");
2985 // Double division to avoid possible underflow.
2987 V[i][j]=(A[i][j]/A[i][l])/g;
2989 for (s=0.0,k=l;k<n;k++) s += A[i][k]*V[j][k];
2990 for (k=l;k<n;k++) V[j][k] += s*V[i][k];
2993 for (j=l;j<n;j++) V[j][i]=V[i][j]=0.0;
2999 // Accumulation of left-hand transformations.
3000 if (v>=5) printf("\nAccumulation of left-hand transformations\n");
3001 TransposeMatrix(A,n);
3002 for (i=IMIN(m,n)-1;i>=0;i--) {
3003 if (v>=4 && i%100==0) printf("i=%i\n",i);
3004 if (v>=4) fprintf(stderr,".");
3007 for (j=l;j<n;j++) A[j][i]=0.0;
3011 for (s=0.0,k=l;k<m;k++) s += A[i][k]*A[j][k];
3013 for (k=i;k<m;k++) A[j][k] += f*A[i][k];
3015 for (j=i;j<m;j++) A[i][j] *= g;
3016 } else for (j=i;j<m;j++) A[i][j]=0.0;
3020 // Diagonalization of the bidiagonal form: Loop over singular values, and over allowed iterations.
3021 if (v>=5) printf("\nDiagonalization of the bidiagonal form\n");
3022 for (k=n-1;k>=0;k--) {
3023 if (v>=4 && k%100==0) printf("k=%i\n",k);
3024 if (v>=4) fprintf(stderr,".");
3025 for (its=1;its<=30;its++) {
3027 // Test for splitting. Note that rv1[1] is always zero.
3028 for (l=k;l>=0;l--) {
3030 if ((double)(fabs(rv1[l])+anorm) == anorm) {
3034 if ((double)(fabs(w[nm])+anorm) == anorm) break;
3037 // Cancellation of rv1[l], if l > 1.
3040 for (i=l;i<=k;i++) {
3043 if ((double)(fabs(f)+anorm) == anorm) break;
3061 // Singular value is made nonnegative.
3064 for (j=0;j<n;j++) V[k][j] = -V[k][j];
3068 if (its == 30) {printf("Error in SVD: no convergence in 30 iterations\n"); exit(7);}
3069 // Shift from bottom 2-by-2 minor.
3075 f=((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
3077 f=((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
3078 // Next QR transformation:
3080 for (j=l;j<=nm;j++) {
3094 for (jj=0;jj<n;jj++) {
3101 // Rotation can be arbitrary if z = 0.
3111 for (jj=0;jj<m;jj++) {
3123 TransposeMatrix(V,n);
3124 TransposeMatrix(A,n);
3125 delete[](rv1); (rv1) = NULL;
3129 * @brief Computes (a2 + b2 )^1/2 without destructive underflow or overflow.
3132 pythag(double a, double b)
3138 return absa*sqrt(1.0+SQR(absb/absa));
3140 return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+SQR(absa/absb)));
3144 /* @* HitList::ClobberGlobal(void)
3147 HitList::ClobberGlobal(void){
3150 /* @<variables local to HitList::ClobberGlobal@> */
3151 class List<Hit>::ListEl<Hit> *pvIter = head;
3153 /* NOTE: no free/delete-ing of data to be done here
3154 hitlist only holds a shallow copy of hit;
3155 hit is being cleared off properly.
3156 just reset everything to 0/0.0/NULL.
3157 The only important thing to do at this stage
3158 is to attach head and tail and set size = 0
3161 NOTE: I only ever saw 1 (one) in-between element,
3162 but there may ctually be a real linked list
3163 of more than 1 element (FS, 2010-02-18)
3166 // printf("POINTER:\t%p\t=HEAD\n", head);
3167 while (pvIter->next != tail){
3169 // printf("POINTER:\t%p->\t%p\n", pvIter, pvIter->next);
3170 pvIter = pvIter->next;
3173 pvIter->data.longname = pvIter->data.name =
3174 pvIter->data.file = pvIter->data.dbfile = NULL;
3175 pvIter->data.sname = NULL;
3176 pvIter->data.seq = NULL;
3177 pvIter->data.self = 0;
3178 pvIter->data.i = pvIter->data.j = NULL;
3179 pvIter->data.states = NULL;
3180 pvIter->data.S = pvIter->data.S_ss = pvIter->data.P_posterior = NULL;
3181 pvIter->data.Xcons = NULL;
3182 pvIter->data.sum_of_probs = 0.0;
3183 pvIter->data.Neff_HMM = 0.0;
3184 pvIter->data.score_ss = pvIter->data.Pval = pvIter->data.logPval =
3185 pvIter->data.Eval = pvIter->data.Probab = pvIter->data.Pforward = 0.0;
3186 pvIter->data.nss_conf = pvIter->data.nfirst =
3187 pvIter->data.i1 = pvIter->data.i2 = pvIter->data.j1 = pvIter->data.j2 =
3188 pvIter->data.matched_cols = pvIter->data.ssm1 = pvIter->data.ssm2 = 0;
3191 // printf("POINTER:\t\t\t%p=TAIL\n", tail);
3201 } /* this is the end of HitList::ClobberGlobal() */