binaries/src/clustalo/src/hhalign/hhhitlist-C.h

   1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2
   3 /*********************************************************************
   4  * Clustal Omega - Multiple sequence alignment
   5  *
   6  * Copyright (C) 2010 University College Dublin
   7  *
   8  * Clustal-Omega is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This file is part of Clustal-Omega.
  14  *
  15  ********************************************************************/
  16
  17 /*
  18  * RCS $Id: hhhitlist-C.h 243 2011-05-31 13:49:19Z fabian $
  19  */
  20
  21 // hhhitlist.C
  22
  23 #ifndef MAIN
  24 #define MAIN
  25 #include <iostream>   // cin, cout, cerr
  26 #include <fstream>    // ofstream, ifstream
  27 #include <stdio.h>    // printf
  28 #include <stdlib.h>   // exit
  29 #include <string>     // strcmp, strstr
  30 #include <math.h>     // sqrt, pow
  31 #include <limits.h>   // INT_MIN
  32 #include <float.h>    // FLT_MIN
  33 #include <time.h>     // clock
  34 #include <ctype.h>    // islower, isdigit etc
  35 using std::ios;
  36 using std::ifstream;
  37 using std::ofstream;
  38 using std::cout;
  39 using std::cerr;
  40 using std::endl;
  41 #include "util-C.h"     // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
  42 #include "list.h"     // list data structure
  43 #include "hash.h"     // hash data structure
  44 #include "hhdecl-C.h"      // constants, class
  45 #include "hhutil-C.h"      // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
  46 #include "hhhmm.h"       // class HMM
  47 #include "hhalignment.h" // class Alignment
  48 #include "hhhit.h"
  49 #include "hhhalfalignment.h"
  50 #include "hhfullalignment.h"
  51 #endif
  52
  53
  54 //////////////////////////////////////////////////////////////////////////////
  55 //////////////////////////////////////////////////////////////////////////////
  56 //// Methods of class HitList
  57 //////////////////////////////////////////////////////////////////////////////
  58 //////////////////////////////////////////////////////////////////////////////
  59
  60
  61
  62 //////////////////////////////////////////////////////////////////////////////
  63 /**
  64  * @brief Print summary listing of hits
  65  */
  66 void
  67 HitList::PrintHitList(HMM& q, char* outfile)
  68 {
  69   Hit hit;
  70   int nhits=0;
  71   char str[NAMELEN]="";
  72
  73   FILE* outf=NULL;
  74   if (strcmp(outfile,"stdout"))
  75     {
  76       outf=fopen(outfile,"w");
  77       if (!outf) OpenFileError(outfile);
  78     }
  79   else
  80     outf = stdout;
  81
  82
  83   fprintf(outf,"Query         %s\n",q.longname);
  84 //  fprintf(outf,"Family        %s\n",q.fam);
  85   fprintf(outf,"Match_columns %i\n",q.L);
  86   fprintf(outf,"No_of_seqs    %i out of %i\n",q.N_filtered,q.N_in);
  87   fprintf(outf,"Neff          %-4.1f\n",q.Neff_HMM);
  88   fprintf(outf,"Searched_HMMs %i\n",N_searched);
  89
  90   // Print date stamp
  91   time_t* tp=new(time_t);
  92   *tp=time(NULL);
  93   fprintf(outf,"Date          %s",ctime(tp));
  94   delete (tp); (tp) = NULL;
  95
  96   // Print command line
  97   fprintf(outf,"Command       ");
  98   for (int i=0; i<par.argc; i++)
  99     if (strlen(par.argv[i])<=par.maxdbstrlen)
 100       fprintf(outf,"%s ",par.argv[i]);
 101     else
 102       fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i]));
 103   fprintf(outf,"\n\n");
 104
 105 #ifdef WINDOWS
 106   if (par.trans)
 107     fprintf(outf," No Hit                             Prob  E-trans  E-value  Score    SS Cols Query HMM  Template HMM\n");
 108   else
 109     fprintf(outf," No Hit                             Prob  E-value  P-value  Score    SS Cols Query HMM  Template HMM\n");
 110 #else
 111   if (par.trans)
 112     fprintf(outf," No Hit                             Prob E-trans E-value  Score    SS Cols Query HMM  Template HMM\n");
 113   else
 114     fprintf(outf," No Hit                             Prob E-value P-value  Score    SS Cols Query HMM  Template HMM\n");
 115 #endif
 116
 117   Reset();
 118   while (!End()) // print hit list
 119     {
 120       hit = ReadNext();
 121       if (nhits>=par.Z) break;       //max number of lines reached?
 122       if (nhits>=par.z && hit.Probab < par.p) break;
 123       if (nhits>=par.z && hit.Eval > par.E) continue;
 124 //       if (hit.matched_cols <=1) continue; // adding this might get to intransparent... analogous statement in PrintAlignments
 125        nhits++;
 126        sprintf(str,"%3i %-30.30s    ",nhits,hit.longname);
 127
 128
 129 #ifdef WINDOWS
 130        if (par.trans) // Transitive scoring
 131          fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols);
 132        else // Normal scoring
 133          fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols);
 134 #else
 135        if (par.trans) // Transitive scoring
 136          fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols);
 137        else // Normal scoring
 138          fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols);
 139 #endif
 140
 141       sprintf(str,"%4i-%-4i ",hit.i1,hit.i2);
 142       fprintf(outf,"%-10.10s",str);
 143       sprintf(str,"%4i-%-4i",hit.j1,hit.j2);
 144       fprintf(outf,"%-9.9s(%i)\n",str,hit.L);
 145    } //end print hit list
 146   fprintf(outf,"\n");
 147   if (strcmp(outfile,"stdout")) fclose(outf);
 148 }
 149
 150
 151
 152 //////////////////////////////////////////////////////////////////////////////
 153 /**
 154  * @brief Print alignments of query sequences against hit sequences
 155  */
 156 int
 157 HitList::PrintAlignments(
 158
 159
 160 #ifdef CLUSTALO
 161                               char **ppcFirstProf, char **ppcSecndProf,
 162 #endif
 163                               HMM& q, char* outfile, char outformat)
 164 {
 165   Hit hit;
 166   FullAlignment qt_ali(par.nseqdis+10); // maximum 10 annotation (pseudo) sequences (ss_dssp, sa_dssp, ss_pred, ss_conf, consens,...)
 167   int nhits=0;
 168
 169 #ifndef CLUSTALO_NOFILE
 170   FILE* outf=NULL;
 171   if (strcmp(outfile,"stdout"))
 172     {
 173       if (outformat==0)
 174         outf=fopen(outfile,"a"); //append to summary hitlist
 175       else
 176         outf=fopen(outfile,"w"); //open for writing
 177       if (!outf) OpenFileError(outfile);
 178     }
 179   else
 180     outf = stdout;
 181 #endif
 182
 183   Reset();
 184   while (!End()) // print hit list
 185     {
 186       if (nhits>=par.B) break;       //max number of lines reached?
 187       hit = ReadNext();
 188       if (nhits>=par.b && hit.Probab < par.p) break;
 189       if (nhits>=par.b && hit.Eval > par.E) continue;
 190       //      // adding this might get to intransparent...
 191       //      // analogous statement in PrintHitlist and hhalign.C
 192       //       if (hit.matched_cols <=1) continue;
 193       nhits++;
 194
 195       // Build double alignment of query against template sequences
 196       int iBuildRet = qt_ali.Build(q,hit);
 197       if (iBuildRet != OK){ /* FS, r241 -> r243 */
 198           fprintf(stderr, "%s:%s:%d: qt_ali.Build failed\n",
 199                  __FUNCTION__, __FILE__, __LINE__);
 200           return FAILURE;
 201       }
 202
 203 #ifndef CLUSTALO
 204       // Print out alignment
 205       if (outformat==0) // HHR format
 206         {
 207           fprintf(outf,"No %-3i\n",nhits);
 208           qt_ali.PrintHeader(outf,q,hit);
 209           qt_ali.PrintHHR(outf,hit);
 210         }
 211       else if (outformat==1) // FASTA format
 212         {
 213           fprintf(outf,"# No %-3i\n",nhits);
 214           qt_ali.PrintFASTA(outf,hit);
 215         }
 216       else if(outformat==2) // A2M format
 217         {
 218           fprintf(outf,"# No %-3i\n",nhits);
 219           qt_ali.PrintA2M(outf,hit);
 220         }
 221       else // A3m format
 222         {
 223           fprintf(outf,"# No %-3i\n",nhits);
 224           qt_ali.PrintA3M(outf,hit);
 225         }
 226 #else
 227       qt_ali.OverWriteSeqs(ppcFirstProf, ppcSecndProf);
 228 #endif
 229
 230       qt_ali.FreeMemory();
 231     }
 232 #ifndef CLUSTALO_NOFILE
 233   if (strcmp(outfile,"stdout")) fclose(outf);
 234 #endif
 235
 236   return OK;
 237
 238 } /* this is the end of HitList::PrintAlignments() */
 239
 240
 241
 242
 243
 244 ////////////////////////////////////////////////////////////////////////////
 245 /**
 246  * @brief Return the ROC_5 score for optimization
 247  * (changed 28.3.08 by Michael & Johannes)
 248  */
 249 void
 250 HitList::Optimize(HMM& q, char* buffer)
 251 {
 252   const int NFAM =5;   // calculate ROC_5 score
 253   const int NSFAM=5;   // calculate ROC_5 score
 254   int roc=0;           // ROC score
 255   int fam=0;           // number of hits from same family (at current threshold)
 256   int not_fam=0;       // number of hits not from same family
 257   int sfam=0;          // number of hits from same suporfamily (at current threshold)
 258   int not_sfam=0;      // number of hits not from same superfamily
 259   Hit hit;
 260
 261   SortList();
 262   Reset();
 263   while (!End())
 264     {
 265       hit = ReadNext();
 266       if (!strcmp(hit.fam,q.fam)) fam++;       // query and template from same superfamily? => positive
 267       else if (not_fam<NFAM) // query and template from different family? => negative
 268         {
 269           not_fam++;
 270           roc += fam;
 271         }
 272       if (!strcmp(hit.sfam,q.sfam)) sfam++;       // query and template from same superfamily? => positive
 273       else if (not_sfam<NSFAM) // query and template from different superfamily?   => negative
 274         {
 275           not_sfam++;
 276           roc += sfam;
 277         }
 278 //       printf("qfam=%s tfam=%s qsfam=%s tsfam=%s  fam=%-2i  not_fam=%3i  sfam=%-3i  not_sfam=%-5i  roc=%-3i\n",q.fam,hit.fam,q.sfam,hit.sfam,fam,not_fam,sfam,not_sfam,roc);
 279     }
 280
 281   // Write ROC score to file or stdout
 282   FILE* buf=NULL;
 283   if (strcmp(par.buffer,"stdout"))
 284     {
 285       buf=fopen(buffer,"w");
 286       if (!buf) OpenFileError(par.buffer);
 287     }
 288   else
 289     buf = stdout;
 290
 291   fprintf(buf,"%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); // must be between 0 and 1
 292   if (v>=2) printf("ROC=%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM));
 293   fclose(buf);
 294 }
 295
 296
 297
 298 //////////////////////////////////////////////////////////////////////////////
 299 /**
 300  * @brief Print score distribution into file score_dist
 301  */
 302 void
 303 HitList::PrintScoreFile(HMM& q)
 304 {
 305   int i=0, n;
 306   FILE* scoref=NULL;
 307   Hit hit;
 308   Hash<int> twice(10000); // make sure only one hit per HMM is listed
 309   twice.Null(-1);
 310
 311   if (strcmp(par.scorefile,"stdout"))
 312     {
 313       scoref=fopen(par.scorefile,"w");
 314       if (!scoref)
 315         {cerr<<endl<<"WARNING from "<<par.argv[0]<<": could not open \'"<<par.scorefile<<"\'\n"; return;}
 316     }
 317   else
 318     scoref = stdout;
 319   Reset();
 320   fprintf(scoref,"NAME  %s\n",q.longname);
 321   fprintf(scoref,"FAM   %s\n",q.fam);
 322   fprintf(scoref,"FILE  %s\n",q.file);
 323   fprintf(scoref,"LENG  %i\n",q.L);
 324   fprintf(scoref,"\n");
 325 //fprintf(scoref,"TARGET      REL LEN COL LOG-PVA   S-TOT     MS NALI\n");
 326
 327 //For hhformat, the PROBAB field has to start at position 41 !!
 328 //                ----+----1----+----2----+----3----+----4----+----
 329   fprintf(scoref,"TARGET     FAMILY   REL LEN COL LOG-PVA  S-AASS PROBAB SCORE\n");
 330   //              d153l__               5 185 185  287.82  464.22 100.00
 331   //              d1qsaa2               3 168 124  145.55  239.22  57.36
 332   while (!End())
 333     {
 334       i++;
 335       hit = ReadNext();
 336       if (twice[hit.name]==1) continue; // better hit with same HMM has been listed already
 337       twice.Add(hit.name,1);
 338      //if template and query are from the same superfamily
 339       if (!strcmp(hit.name,q.name)) n=5;
 340       else if (!strcmp(hit.fam,q.fam)) n=4;
 341       else if (!strcmp(hit.sfam,q.sfam)) n=3;
 342       else if (!strcmp(hit.fold,q.fold)) n=2;
 343       else if (!strcmp(hit.cl,q.cl)) n=1;
 344       else n=0;
 345       fprintf(scoref,"%-10s %-10s %1i %3i %3i %s %7.2f %6.2f %7.2f\n",hit.name,hit.fam,n,hit.L,hit.matched_cols,sprintg(-1.443*hit.logPval,7),-hit.score_aass,hit.Probab,hit.score);
 346     }
 347   fclose(scoref);
 348 }
 349
 350
 351 inline double
 352 logPvalue_HHblast(double s, double corr)
 353 {
 354    return -s*(1.0-0.5*corr) + (1.0-corr)*log(1.0+s);
 355 // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*s );
 356 // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*(1.0-0.5*corr)*s );
 357 }
 358
 359 inline double
 360 Pvalue_HHblast(double s, double corr)
 361 {
 362    return exp(-s*(1.0-0.5*corr)) * pow(1.0+s,1.0-corr);
 363 // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*s );
 364 // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*(1.0-0.5*corr)*s );
 365 }
 366
 367 inline double
 368 logLikelihood_HHblast(double s, double corr)
 369 {
 370   if (s<0.0) { s=0.0; if (corr<1E-5) corr=1E-5; else if (corr>0.99999) corr=0.99999; }
 371   else       {        if (corr<0.0)  corr=0.0;  else if (corr>1.0) corr=1.0; }
 372   return -s*(1.0-0.5*corr) - corr*log(1.0+s) + log(s*(1.0-0.5*corr)+0.5*corr);
 373   // return -s*(1.0-0.5*corr) + log( s*(1.0-corr)*(1.0-0.5*corr)+0.5*corr );
 374   // return -s*(1.0-0.5*corr) + log((s*(1.0-corr)*(1.0-0.5*corr)+corr)*(1.0-0.5*corr));
 375 }
 376
 377 /////////////////////////////////////////////////////////////////////////////////////
 378 /**
 379  * @brief Evaluate the *negative* log likelihood for the order statistic of the uniform distribution
 380  * for the best 10% of hits (vertex v = (corr,offset) )
 381  *  The k'th order statistic for X~Uniform is p:=X^(k)~Beta(k,n-k+1) = const*p^(k-1)*(1-p)^(n-k)
 382  * Needed to fit the correlation and score offset in HHblast
 383 */
 384 double
 385 HitList::RankOrderFitCorr(double* v)
 386 {
 387   double sum=0.0;
 388 //   printf("%8.2G  %8.2G  %i\n",v[0],v[1],Nprof);
 389   int i1 = imin(Nprof,imax(50,int(0.05*Nprof)));
 390   for (int i=0; i<i1; i++)
 391     {
 392       double p = Pvalue_HHblast(score[i]+v[1],v[0]);
 393 //       sum -= (1.0-double(i)/double(i1)) * weight[i] * ( double(i)*log(p) + (Nprof-i-1.0)*log(1.0-p) );
 394       float diff = p-(float(i)+1.0)/(Nprof+1.0);
 395       sum += (1.0-double(i)/double(i1)) * weight[i]*diff*diff*(Nprof+1.0)*(Nprof+1.0)*(Nprof+2.0)/(i+10.0)/(Nprof-i);
 396 //       printf("%-3i  Pval=%7.5f  Preal=%7.5f  diff=%7.5f  sum=%7.5f\n",i,p,float(i+1)/(1.0+Nprof),diff,sum);
 397     }
 398   return sum;
 399 }
 400
 401 /**
 402  * @brief Static wrapper-function for calling the nonstatic member function RankOrderFitCorr()
 403  * ( see http://www.newty.de/fpt/callback.html#member )
 404  */
 405 double
 406 HitList::RankOrderFitCorr_static(void* pt2hitlist, double* v)
 407 {
 408   HitList* mySelf = (HitList*) pt2hitlist;  // explicitly cast to a pointer to Hitlist
 409   return mySelf->RankOrderFitCorr(v); // call member function
 410 }
 411
 412 /////////////////////////////////////////////////////////////////////////////////////
 413 /**
 414  * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (corr,offset)
 415  * Needed to fit the correlation and score offset in HHblast
 416  */
 417 double
 418 HitList::LogLikelihoodCorr(double* v)
 419 {
 420   double sum=0.0;
 421 //   printf("%8.2G  %8.2G  %i\n",v[0],v[1],Nprof);
 422   for (int i=0; i<Nprof; i++)
 423     {
 424       sum -= weight[i]*logLikelihood_HHblast(score[i]+v[1],v[0]);
 425 //    printf("%-3i  Pval=%7.5f  Preal=%7.5f  diff=%7.5f  rmsd=%7.5f  sum=%7.5f\n",i,Pvalue_HHblast(score[i],v[0]),float(i)/(1.0+Nprof),x,sqrt(sum/sumw),sum);
 426     }
 427   return sum;
 428 }
 429
 430 /**
 431  * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodCorr()
 432  * ( see http://www.newty.de/fpt/callback.html#member )
 433  */
 434 double
 435 HitList::LogLikelihoodCorr_static(void* pt2hitlist, double* v)
 436 {
 437   HitList* mySelf = (HitList*) pt2hitlist;  // explicitly cast to a pointer to Hitlist
 438   return mySelf->LogLikelihoodCorr(v); // call member function
 439 }
 440
 441 /////////////////////////////////////////////////////////////////////////////////////
 442 /**
 443  * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (lamda,mu)
 444  *    p(s) = lamda * exp{ -exp[-lamda*(s-mu)] - lamda*(s-mu) } = lamda * exp( -exp(-x) - x)
 445  */
 446 double
 447 HitList::LogLikelihoodEVD(double* v)
 448 {
 449   double sum=0.0, sumw=0.0;
 450   for (int i=0; i<Nprof; i++)
 451     {
 452       double x = v[0]*(score[i]-v[1]);
 453       sum += weight[i]*(exp(-x)+x);
 454       sumw += weight[i];
 455     }
 456   return sum - sumw*log(v[0]);
 457 }
 458
 459 /**
 460  * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodEVD()
 461  * ( see http://www.newty.de/fpt/callback.html#member )
 462  */
 463 double
 464 HitList::LogLikelihoodEVD_static(void* pt2hitlist, double* v)
 465 {
 466   HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist
 467   return mySelf->LogLikelihoodEVD(v);                // call member function
 468 }
 469
 470 /////////////////////////////////////////////////////////////////////////////////////
 471 /**
 472  * @brief Subroutine to FindMin: try new point given by highest point ihigh and fac and replace ihigh if it is lower
 473  */
 474 double
 475 HitList::TryPoint(const int ndim, double* p, double* y, double* psum, int ihigh, double fac, double (*Func)(void* pt2hitlist, double* v))
 476 {
 477   // New point p_try = p_c + fac*(p_high-p_c),
 478   // where p_c = ( sum_i (p_i) - p_high)/ndim is the center of ndim other points
 479   // => p_try = fac1*sum_i(p_i) + fac2*p_high
 480   double fac1=(1.-fac)/ndim;
 481   double fac2=fac-fac1;
 482   double ptry[ndim];   //new point to try out
 483   double ytry;         //function value of new point
 484   int j;               //index for the ndim parameters
 485
 486   for (j=0; j<ndim; j++)
 487     ptry[j]=psum[j]*fac1+p[ihigh*ndim+j]*fac2;
 488   ytry = (*Func)(this,ptry);
 489   if (ytry<=y[ihigh])
 490     {
 491 //       if (v>=4) printf("Trying:                  %-7.3f %-7.3f %-7.3f -> accept\n",ptry[0],ptry[1],ytry);
 492       y[ihigh]=ytry;
 493       for (j=0; j<ndim; j++)
 494         {
 495           psum[j] += ptry[j]-p[ihigh*ndim+j]; //update psum[j]
 496           p[ihigh*ndim+j]=ptry[j];            //replace p[ihigh] with ptry
 497         }                                     //Note: ihigh is now not highest point anymore!
 498     }
 499 //   else if (v>=4) printf("Trying:                  %-7.3f %-7.3f %-7.3f -> reject\n",ptry[0],ptry[1],ytry);
 500
 501   return ytry;
 502 }
 503
 504
 505
 506 /////////////////////////////////////////////////////////////////////////////////////
 507 /**
 508  * @brief Find minimum with simplex method of Nelder and Mead (1965)
 509  */
 510 float
 511 HitList::FindMin(const int ndim, double* p, double* y, double tol, int& nfunc, double (*Func)(void* pt2hitlist, double* v))
 512 {
 513   const int MAXNFUNC=99; //maximum allowed number of function evaluations
 514   int ihigh;    //index of highest point on simplex
 515   int inext;    //index of second highest point on simplex
 516   int ilow;     //index of lowest point on simplex
 517   int i;        //index for the ndim+1 points
 518   int j;        //index for the ndim parameters
 519   double rtol;  //tolerance: difference of function value between highest and lowest point of simplex
 520   double temp;   //dummy
 521   double ytry;   //function value of trial point
 522   double psum[ndim]; //psum[j] = j'th coordinate of sum vector (sum over all vertex vectors)
 523
 524   nfunc=0;    //number of function evaluations =0
 525   //Calculate sum vector psum[j]
 526   for (j=0; j<ndim; j++)
 527     {
 528       psum[j]=p[j];
 529       for (i=1; i<ndim+1; i++)
 530         psum[j]+=p[i*ndim+j];
 531     }
 532
 533   // Repeat finding better points in simplex until rtol<tol
 534   while(1)
 535     {
 536       // Find indices for highest, next highest and lowest point
 537       ilow=0;
 538       if (y[0]>y[1]) {inext=1; ihigh=0;} else {inext=0; ihigh=1;}
 539       for (i=0; i<ndim+1; i++)
 540         {
 541           if (y[i]<=y[ilow]) ilow=i;
 542           if (y[i]>y[ihigh]) {inext=ihigh; ihigh=i;}
 543           else if (y[i]>y[inext] && i!= ihigh) inext=i;
 544         }
 545
 546       // If tolerance in y is smaller than tol swap lowest point to index 0 and break -> return
 547       rtol = 2.*fabs(y[ihigh]-y[ilow]) / (fabs(y[ihigh])+fabs(y[ilow])+1E-10);
 548       if (rtol<tol)
 549         {
 550           temp=y[ilow]; y[ilow]=y[0]; y[0]=temp;
 551           for (j=0; j<ndim; j++)
 552             {
 553               temp=p[ilow*ndim+j]; p[ilow*ndim+j]=p[j]; p[j]=temp;
 554             }
 555           break;
 556         }
 557
 558       // Max number of function evaluations exceeded?
 559       if (nfunc>=MAXNFUNC )
 560         {
 561           if (v) fprintf(stderr,"\nWARNING: maximum likelihood fit of score distribution did not converge.\n");
 562           return 1;
 563         }
 564
 565       nfunc+=2;
 566       // Point-reflect highest point on the center of gravity p_c of the other ndim points of the simplex
 567       if (v>=3) printf("%3i  %-7.3f  %-7.3f %-12.8f %-9.3E\n",nfunc,p[ilow*ndim],p[ilow*ndim+1],y[ilow],rtol);
 568 //       if (v>=2) printf("           %3i %-9.3E   %-7.3f %-7.3f %-7.3f   %-7.3f %-7.3f %-7.3f   %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
 569       ytry = TryPoint(ndim,p,y,psum,ihigh,-1.0,Func); //reflect highest point on p_c
 570
 571       if (ytry<=y[ilow])
 572         {
 573           ytry = TryPoint(ndim,p,y,psum,ihigh,2.0,Func); //expand: try new point 2x further away from p_c
 574 //        if (v>=2) printf("Expanded:  %3i %-9.3E   %-7.3f %-7.3f %-7.3f   %-7.3f %-7.3f %-7.3f   %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
 575         }
 576       else if (ytry>=y[inext])
 577         {
 578           // The new point is worse than the second worst point
 579           temp=y[ihigh];
 580           ytry=TryPoint(ndim,p,y,psum,ihigh,0.5,Func); //contract simplex by 0.5 along (p_high-p_c
 581 //        if (v>=2) printf("Compressed:%3i %-9.3E   %-7.3f %-7.3f %-7.3f   %-7.3f %-7.3f %-7.3f   %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
 582           if (ytry>=temp)
 583             {
 584               // Trial point is larger than worst point => contract simplex by 0.5 towards lowest point
 585               for (i=0; i<ndim+1; i++)
 586                 {
 587                   if (i!=ilow)
 588                     {
 589                       for (j=0; j<ndim; j++)
 590                         p[i*ndim+j]=0.5*(p[i*ndim+j]+p[ilow+j]);
 591                       y[i] = (*Func)(this,p+i*ndim);
 592 //                    y[i] = (*Func)(p+i*ndim);
 593                     }
 594                 }
 595               nfunc+=ndim;
 596 //            if (v>=2) printf("Contracted:%3i %-9.3E   %-7.3f %-7.3f  %-7.3f  %-7.3f %-7.3f  %-7.3f  %-7.3f %-7.3f  %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
 597
 598               //Calculate psum[j]
 599               for (j=0; j<ndim; j++)
 600                 {
 601                   psum[j]=p[j];
 602                   for (i=1; i<ndim+1; i++)
 603                     psum[j]+=p[i*ndim+j];
 604                 }
 605             }
 606         }
 607       else nfunc--;
 608     }
 609   return (float)rtol;
 610 }
 611
 612
 613
 614 /////////////////////////////////////////////////////////////////////////////////////
 615 /**
 616  * @brief Do a maximum likelihod fit of the scores with an EV distribution with parameters lamda and mu
 617  */
 618 void
 619 HitList::MaxLikelihoodEVD(HMM& q, int nbest)
 620 {
 621   double tol=1E-6;                 // Maximum relative tolerance when minimizing -log(P)/N (~likelihood)
 622   static char first_call=1;
 623   static Hash<int> size_fam(MAXPROF/10);  // Hash counts number of HMMs in family
 624   static Hash<int> size_sfam(MAXPROF/10); // Hash counts number of families in superfamily
 625   Hash<int> excluded(50);          // Hash containing names of superfamilies to be excluded from fit
 626   size_fam.Null(0);                // Set int value to return when no data can be retrieved
 627   size_sfam.Null(0);               // Set int value to return when no data can be retrieved
 628   excluded.Null(0);                // Set int value to return when no data can be retrieved
 629   Hit hit;
 630
 631   double mu;                       // EVD[mu,lam](x) = exp(-exp(-(x-mu)/lam)) = P(score<=x)
 632   double vertex[2*3];              // three vertices of the simplex in lamda-mu plane
 633   double yvertex[3];               // log likelihood values at the three vertices of the simplex
 634   int nfunc=0;                     // number of function calls
 635   double sum_weights=0.0;
 636   float sum_scores=0.0;
 637   float rtol;
 638
 639   if (first_call==1)
 640     {
 641       first_call=0;
 642       // Count how many HMMs are in each family; set number of multiple hits per template nrep
 643       if (v>=4) printf("  count number of profiles in each family and families in each superfamily ...\n");
 644       Reset();
 645       while (!End())
 646         {
 647           hit = ReadNext();
 648           if (!size_fam.Contains(hit.fam)) (*size_sfam(hit.sfam))++; //Add one to hash element for superfamily
 649          (*size_fam(hit.fam))++;              //Add one to hash element for family
 650           //      printf("size(%s)=%i name=%s\n",hit.fam,*size_fam(hit.fam),hit.name)
 651         }
 652       fams=size_fam.Size();
 653       sfams=size_sfam.Size();
 654       if (v>=3)
 655         printf("%-3i HMMs from %i families and %i superfamilies searched. Found %i hits\n",N_searched,fams,sfams,Size());
 656     }
 657
 658   // Query has SCOP family identifier?
 659   if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.')
 660     {
 661       char sfamid[NAMELEN];
 662       char* ptr_in_fam=q.fam;
 663       while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
 664         {
 665           char* ptr=strrchr(sfamid,'.');
 666           if (ptr) *ptr='\0';
 667           excluded.Add(sfamid);
 668 //        fprintf(stderr,"Exclude SCOP superfamily %s  ptr_in_fam='%s'\n",sfamid,ptr_in_fam);
 669       }
 670     }
 671   // Exclude best superfamilies from fit
 672   else if (nbest>0)
 673     {
 674       if (sfams<97+nbest) return;
 675
 676       // Find the nbest best-scoring superfamilies for exclusion from first ML fit
 677       if (v>=4) printf("  find %i best-scoring superfamilies to exclude from first fit  ...\n",nbest);
 678       hit = Smallest();
 679       excluded.Add(hit.sfam);
 680 //       printf("Exclude in first round: %s %8.2f %s\n",hit.name,hit.score_aass,hit.sfam);
 681       while (excluded.Size()<nbest)
 682         {
 683           Reset();
 684           while (!End() && excluded.Contains(ReadNext().sfam)) ;
 685           hit=ReadCurrent();
 686           while (!End())
 687             {
 688               if (ReadNext()<hit && !excluded.Contains(ReadCurrent().sfam))
 689                 hit=ReadCurrent();
 690             }
 691           excluded.Add(hit.sfam);
 692 //        printf("Exclude in first round: %s %8.2f %s %i %i\n",hit.name,hit.score_aass,hit.sfam,excluded.Size(),excluded.Contains(hit.sfam));
 693         }
 694       tol = 0.01/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e
 695     }
 696   else
 697     {
 698       // Find the best-scoring superfamilies from first fit for exclusion from second ML fit
 699       if (v>=4) printf("  find best-scoring superfamilies to exclude from second fit  ...\n");
 700       Reset();
 701       while (!End())
 702         {
 703           hit = ReadNext();
 704           if (hit.Eval < 0.05) excluded.Add(hit.sfam); // changed from 0.5 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 705         }
 706       tol = 0.001/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e
 707     }
 708
 709   // Put scores into score[] and weights into weight[]
 710   if (v>=3) printf("  generate scores and weights array for ML fitting ...\n");
 711   Nprof=0;
 712   Reset();
 713   while (!End())
 714     {
 715       hit = ReadNext();
 716       if (hit.irep > 1) continue;                   //Use only best hit per template
 717       if (Nprof>=MAXPROF) break;
 718
 719       char sfamid[NAMELEN];
 720       char* ptr_in_fam=hit.fam;
 721       while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
 722       {
 723         char* ptr=strrchr(sfamid,'.');
 724         if (ptr) *ptr='\0';
 725         if (excluded.Contains(sfamid)) break;    //HMM is among superfamilies to be excluded
 726       }
 727       if (excluded.Contains(sfamid)) {
 728         if (v>=3) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid);
 729         continue;
 730       }
 731 //       ScopID(hit.cl,hit.fold,hit.sfam,hit.fam);     //Get scop superfamily code for template
 732 //       if (*hit.sfam=='\0' || excluded.Contains(hit.sfam)) continue;    // skip HMM
 733
 734       score[Nprof] = hit.score;
 735       weight[Nprof]=1./size_fam[hit.fam]/size_sfam[hit.sfam];
 736       sum_scores +=hit.score*weight[Nprof];
 737       sum_weights+=weight[Nprof];
 738
 739       //DEBUG
 740 //       if (v>=4) printf("%-10.10s   %-12.12s %-3i   %-12.12s %-3i   %6.4f   %6.4f  %7.1f\n",hit.name,hit.fam,size_fam[hit.fam],hit.sfam,size_sfam[hit.sfam],1./size_fam[hit.fam]/size_sfam[hit.sfam],sum,hit.score);
 741       Nprof++;
 742     }
 743   //DEBUG
 744   if (v>=3)
 745     printf("%i hits used for score distribution\n",Nprof);
 746    // for (int i=0; i<Nprof; i++) printf("%3i  score=%8.3f  weight=%7.5f\n",i,score[i],weight[i]);
 747
 748   // Set simplex vertices and function values
 749   mu = sum_scores/sum_weights - 0.584/LAMDA;
 750   if (par.loc) // fit only in local mode; in global mode use fixed value LAMDA and mu mean score
 751     {
 752       double (*Func)(void*, double*);
 753       Func = HitList::LogLikelihoodEVD_static;
 754
 755       if (nbest>0) {vertex[0]=LAMDA;   vertex[1]=mu;}  /////////////////////////////////////////// DEBUG
 756       else         {vertex[0]=q.lamda; vertex[1]=mu;}
 757       vertex[2]=vertex[0]+0.1; vertex[3]=vertex[1];
 758       vertex[4]=vertex[0];     vertex[5]=vertex[1]+0.2;
 759       yvertex[0]=Func(this,vertex  );
 760       yvertex[1]=Func(this,vertex+2);
 761       yvertex[2]=Func(this,vertex+4);
 762
 763       // Find lam and mu that minimize negative log likelihood of data
 764       if (v>=3) printf("Fitting to EVD by maximum likelihood...\niter lamda       mu    -log(P)/N   tol\n");
 765       rtol = FindMin(2,vertex,yvertex,tol,nfunc,Func);
 766       if (v>=3) printf("%3i  %-7.3f  %-7.2f     %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0]-(1.5772-log(vertex[0])),rtol);
 767 //       printf("HHsearch lamda=%-6.3f   mu=%-6.3f\n",vertex[0],vertex[1]);
 768     }
 769   else
 770     {
 771       vertex[0]=LAMDA_GLOB; vertex[1]=mu;
 772     }
 773
 774   // Set lamda and mu of profile
 775   q.lamda = vertex[0];
 776   q.mu = vertex[1];
 777
 778   // Set P-values and E-values
 779   // CHECK UPDATE FROM score=-logpval to score=-logpval+SSSCORE2NATLOG*score_ss !!!!
 780   Reset();
 781   while (!End())
 782     {
 783       hit = ReadNext();
 784
 785       // Calculate total score in raw score units: P-value = 1- exp(-exp(-lamda*(Saa-mu)))
 786       hit.weight=1./size_fam[hit.fam]/size_sfam[hit.sfam];   // needed for transitive scoring
 787       hit.logPval = logPvalue(hit.score,vertex);
 788       hit.Pval=Pvalue(hit.score,vertex);
 789       hit.Eval=exp(hit.logPval+log(N_searched));
 790 //    hit.score_aass = hit.logPval/0.45-3.0 - hit.score_ss;  // median(lamda)~0.45, median(mu)~4.0 in EVDs for scop20.1.63 HMMs
 791       hit.score_aass = -q.lamda*(hit.score-q.mu)/0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0)); // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs
 792       hit.Probab = Probab(hit);
 793       if (nbest>0 && par.loc) // correct length correction (not needed for second round of fitting, since lamda very similar)
 794         if (par.idummy==0) ////////////////////////////////////////////
 795           hit.score += log(q.L*hit.L)*(1/LAMDA-1/vertex[0]);
 796       hit.score_sort = hit.score_aass;
 797       Overwrite(hit);                     // copy hit object into current position of hitlist
 798
 799       if (nbest==0 && par.trans==1)       // if in transitive scoring mode (weights file given)
 800         TransitiveScoring();
 801       else if (nbest==0 && par.trans==2)  // if in transitive scoring mode (weights file given)
 802         TransitiveScoring2();
 803       else if (nbest==0 && par.trans==3)  // if in transitive scoring mode (weights file given)
 804         TransitiveScoring3();
 805       else if (nbest==0 && par.trans==4)  // if in transitive scoring mode (weights file given)
 806         TransitiveScoring4();
 807     }
 808 }
 809
 810
 811 /////////////////////////////////////////////////////////////////////////////////////
 812 /**
 813  * @brief Calculate correlation and score offset for HHblast composite E-values
 814  */
 815 void
 816 HitList::CalculateHHblastCorrelation(HMM& q)
 817 {
 818   int nfunc=0;                     // number of function calls
 819   double tol;                      // Maximum relative tolerance when minimizing -log(P)/N (~likelihood)
 820   double vertex[2*3];              // three vertices of the simplex in lamda-mu plane
 821   double yvertex[3];               // log likelihood values at the three vertices of the simplex
 822   Hit hit;
 823   Hash<int> excluded(50);          // Hash containing names of superfamilies to be excluded from fit
 824   excluded.Null(0);                // Set int value to return when no data can be retrieved
 825
 826   // Set sum of HHsearch and PSI-BLAST score for calculating correlation
 827   Reset();
 828   while (!End())
 829     {
 830       hit = ReadNext();
 831       hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); // if template not in hash, return log Pval = 0, i.e. Pvalue = 1!
 832       Overwrite(hit);   // copy hit object into current position of hitlist
 833     }
 834
 835   // Query has SCOP family identifier?
 836   if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.')
 837     {
 838       char sfamid[NAMELEN];
 839       char* ptr_in_fam=q.fam;
 840       while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
 841         {
 842           char* ptr=strrchr(sfamid,'.');
 843           if (ptr) *ptr='\0';
 844           excluded.Add(sfamid);
 845           fprintf(stderr,"Exclude SCOP superfamily %s  ptr_in_fam='%s'\n",sfamid,ptr_in_fam);
 846       }
 847     }
 848
 849   // Resort list by sum of log P-values
 850   ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues
 851   Nprof=0;
 852   Reset();
 853   ReadNext(); // skip best hit
 854   while (!End())
 855     {
 856       hit = ReadNext();
 857       if (hit.irep>=2) continue; // use only best alignments
 858 //       if (hit.Eval<0.005) {if (v>=3) printf("Fitting HHblast correlation coefficient: skipping %s with Evalue=%9.1g\n",hit.name,hit.Eval); continue;}
 859       if (Nprof>=MAXPROF) break;
 860
 861       char sfamid[NAMELEN];
 862       char* ptr_in_fam=hit.fam;
 863       while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
 864       {
 865         char* ptr=strrchr(sfamid,'.');
 866         if (ptr) *ptr='\0';
 867         if (excluded.Contains(sfamid)) break;    //HMM is among superfamilies to be excluded
 868       }
 869       if (excluded.Contains(sfamid)) {
 870         if (v>=1) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid);
 871         continue;
 872       }
 873       score[Nprof]  = -hit.score_sort;
 874       weight[Nprof] = 1.0; // = hit.weight;
 875 //       printf("%3i  %-12.12s  %7.3f + %7.3f = %7.3f \n",Nprof,hit.name,hit.logPval,blast_logPvals->Show(hit.name),-hit.score_sort); //////////////////////
 876       printf("%3i  %7.3f %7.3f\n",Nprof,hit.Pval,exp(blast_logPvals->Show(hit.name))); //////////////////////
 877       Nprof++;
 878     }
 879
 880   // Fit correlation
 881   vertex[0]=0.5;           vertex[1]=0.2;
 882   vertex[2]=vertex[0]+0.2; vertex[3]=vertex[1];
 883   vertex[4]=vertex[0];     vertex[5]=vertex[1]+0.2;
 884
 885   yvertex[0]=RankOrderFitCorr(vertex  );
 886   yvertex[1]=RankOrderFitCorr(vertex+2);
 887   yvertex[2]=RankOrderFitCorr(vertex+4);
 888 //   yvertex[0]=LogLikelihoodCorr(vertex  );
 889 //   yvertex[1]=LogLikelihoodCorr(vertex+2);
 890 //   yvertex[2]=LogLikelihoodCorr(vertex+4);
 891   tol = 1e-6;
 892   v=3;//////////////////////////////////
 893   // Find correlation and offset that minimize mean square deviation of reported composite Pvalues from actual
 894   if (v>=2) printf("Fitting correlation coefficient for HHblast...\niter  corr       offset logLikelihood  tol\n");
 895   float rtol = FindMin(2,vertex,yvertex,tol,nfunc, HitList::RankOrderFitCorr_static);
 896   if (v>=2) printf("%3i  %-7.3f  %-7.2f     %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0],rtol);
 897   if (vertex[0]<0) vertex[0]=0.0;
 898
 899   // Print correlation and offset for profile
 900   printf("HHblast correlation=%-6.3f   score offset=%-6.3f\n",vertex[0],vertex[1]);
 901   v=2;//////////////////////////////////
 902 }
 903
 904
 905 /**
 906  * @brief Calculate HHblast composite E-values
 907  */
 908 inline double
 909 corr_HHblast(float Nq, float Nt)
 910 {
 911   return 0.5;
 912 }
 913
 914 /**
 915  * @brief Calculate HHblast composite E-values
 916  */
 917 inline double
 918 offset_HHblast(float Nq, float Nt)
 919 {
 920   return 0.0;
 921 }
 922
 923 //////////////////////////////////////////////////////////////////////////////
 924 /**
 925  * @brief Calculate HHblast composite E-values
 926  */
 927 void
 928 HitList::CalculateHHblastEvalues(HMM& q)
 929 {
 930   Hit hit;
 931   float corr, offset;  // correlation coefficient and offset for calculating composite HHblast P-values
 932
 933   Reset();
 934   while (!End())
 935     {
 936       hit = ReadNext();
 937       corr = corr_HHblast(q.Neff_HMM,hit.Neff_HMM);
 938       offset = offset_HHblast(q.Neff_HMM,hit.Neff_HMM);
 939       hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name);
 940       hit.logPval = logPvalue_HHblast(-hit.score_sort+offset,corr); // overwrite logPval from HHsearch with composite logPval from HHblast
 941       hit.Pval = Pvalue_HHblast(-hit.score_sort+offset,corr);  // overwrite P-value from HHsearch with composite P-value from HHblast
 942       hit.Eval = exp(hit.logPval+log(N_searched));             // overwrite E-value from HHsearch with composite E-value from HHblast
 943       hit.Probab = Probab(hit);
 944       Overwrite(hit);   // copy hit object into current position of hitlist
 945     }
 946   ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues
 947 }
 948
 949
 950 //////////////////////////////////////////////////////////////////////////////
 951 /**
 952  * @brief Read file generated by blastpgp (default output) and store P-values in hash
 953  */
 954 void
 955 HitList::ReadBlastFile(HMM& q)
 956 {
 957   char line[LINELEN]="";    // input line
 958   int Ndb;       // number of sequences in database
 959   int Ldb=0;     // size of database in number of amino acids
 960   char* templ;
 961   int i;
 962   if (!blast_logPvals) { blast_logPvals = new(Hash<float>); blast_logPvals->New(16381,0); }
 963
 964   FILE* blaf = NULL;
 965   if (!strcmp(par.blafile,"stdin")) blaf=stdin;
 966   else
 967     {
 968       blaf = fopen(par.blafile,"rb");
 969       if (!blaf) OpenFileError(par.blafile);
 970     }
 971
 972   // Read number of sequences and size of database
 973   while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"sequences;"));
 974   if (!strstr(line,"sequences;")) FormatError(par.blafile,"No 'Database:' string found.");
 975   char* ptr=line;
 976   Ndb = strint(ptr);
 977   if (Ndb==INT_MIN) FormatError(par.blafile,"No integer for number of sequences in database found.");
 978   while ((i=strint(ptr))>INT_MIN) Ldb = 1000*Ldb + i;
 979   if (Ldb==0) FormatError(par.blafile,"No integer for size of database found.");
 980   printf("\nNumber of sequences in database = %i    Size of database = %i\n",Ndb,Ldb);
 981
 982   // Read all E-values and sequence lengths
 983   while (fgetline(line,LINELEN-1,blaf))
 984     {
 985       if (line[0]=='>')
 986         {
 987           // Read template name
 988           templ = new(char[255]);
 989           ptr = line+1;
 990           strwrd(templ,ptr);
 991           if (!blast_logPvals->Contains(templ)) // store logPval only for best HSP with template
 992             {
 993               // Read length
 994               while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"Length ="));
 995               ptr = line+18;
 996               int length = strint(ptr);
 997               // Read E-value
 998               fgetline(line,LINELEN-1,blaf);
 999               fgetline(line,LINELEN-1,blaf);
1000               float EvalDB; // E-value[seq-db]  = Evalue for comparison Query vs. database, from PSI-BLAST
1001               float EvalQT; // E-value[seq-seq] = Evalue for comparison Query vs. template (seq-seq)
1002               double logPval;
1003               ptr = strstr(line+20,"Expect =");
1004               if (!ptr) FormatError(par.blafile,"No 'Expect =' string found.");
1005               if (sscanf(ptr+8,"%g",&EvalDB)<1)
1006                 {
1007                   ptr[7]='1';
1008                   if (sscanf(ptr+7,"%g",&EvalDB)<1)
1009                     FormatError(par.blafile,"No Evalue found after 'Expect ='.");
1010                 }
1011               // Calculate P-value[seq-seq] = 1 -  exp(-E-value[seq-seq]) = 1 - exp(-Lt/Ldb*E-value[seq-db])
1012               EvalQT = length/double(Ldb)*double(EvalDB);
1013               if (EvalQT>1E-3) logPval = log(1.0-exp(-EvalQT)); else logPval=log(double(EvalQT)+1.0E-99);
1014               blast_logPvals->Add(templ,logPval);
1015               printf("template=%-10.10s  length=%-3i  EvalDB=%8.2g  EvalQT=%8.2g  P-value=%8.2g log Pval=%8.2g\n",templ,length,EvalDB,EvalQT,exp(logPval),logPval);
1016             }
1017           else {
1018             delete[] templ; templ = NULL;
1019           }
1020         }
1021     }
1022   fclose(blaf);
1023 }
1024
1025
1026 /////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Calculate output of hidden neural network units
 *
 * @param[in] weights four weights, applied to Lqnorm, Ltnorm, Nqnorm, Ntnorm in this order
 * @param[in] bias    pointer to the bias added to the weighted sum
 * @param[in] Lqnorm  first input
 * @param[in] Ltnorm  second input
 * @param[in] Nqnorm  third input
 * @param[in] Ntnorm  fourth input
 * @return logistic function 1/(1+exp(-a)) of the activation a
 */
inline float
calc_hidden_output(const float* weights, const float* bias, float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
{
  // Activation of the hidden unit = sum of all inputs * weights + bias
  const float activation = Lqnorm*weights[0] + Ltnorm*weights[1] + Nqnorm*weights[2] + Ntnorm*weights[3] + *bias;

  // Squash the activation into (0,1) with the logistic function
  return 1.0 / (1.0 + exp(-activation));
}
1039
1040 ////////////////////////////////////////////////////////////////////////////////////
1041 /**
1042  * @brief Neural network regressions of lamda for EVD
1043  */
1044 inline float
1045 lamda_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
1046 {
1047   const int inputs = 4;
1048   const int hidden = 4;
1049   const float biases[] = {-0.73195, -1.43792, -1.18839, -3.01141}; // bias for all hidden units
1050   const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit)
1051     -0.52356, -3.37650, 1.12984, -0.46796,
1052     -4.71361, 0.14166, 1.66807, 0.16383,
1053     -0.94895, -1.24358, -1.20293, 0.95434,
1054     -0.00318, 0.53022, -0.04914, -0.77046,
1055     2.45630, 3.02905, 2.53803, 2.64379
1056   };
1057   float lamda=0.0;
1058   for (int h = 0; h<hidden; h++) {
1059     lamda += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h];
1060   }
1061   return lamda;
1062 }
1063
1064 ////////////////////////////////////////////////////////////////////////////////////
1065 /**
1066  * @brief Neural network regressions of mu for EVD
1067  */
1068 inline float
1069 mu_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
1070 {
1071   const int inputs = 4;
1072   const int hidden = 6;
1073   const float biases[] = {-4.25264, -3.63484, -5.86653, -4.78472, -2.76356, -2.21580};  // bias for all hidden units
1074   const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit)
1075     1.96172, 1.07181, -7.41256, 0.26471,
1076     0.84643, 1.46777, -1.04800, -0.51425,
1077     1.42697, 1.99927, 0.64647, 0.27834,
1078     1.34216, 1.64064, 0.35538, -8.08311,
1079     2.30046, 1.31700, -0.46435, -0.46803,
1080     0.90090, -3.53067, 0.59212, 1.47503,
1081     -1.26036, 1.52812, 1.58413, -1.90409, 0.92803, -0.66871
1082   };
1083   float mu=0.0;
1084   for (int h = 0; h<hidden; h++) {
1085     mu += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h];
1086   }
1087   return 20.0*mu;
1088 }
1089
1090 //////////////////////////////////////////////////////////////////////////////
1091 /**
1092  * @brief Calculate Pvalues as a function of query and template lengths and diversities
1093  */
1094 void
1095 HitList::CalculatePvalues(HMM& q)
1096 {
1097   Hit hit;
1098   float lamda=0.4, mu=3.0;
1099   const float log1000=log(1000.0);
1100
1101   if (par.idummy!=2)
1102     {
1103       printf("WARNING: idummy should have been ==2 (no length correction)\n");
1104       exit(4);
1105     }
1106
1107   if(N_searched==0) N_searched=1;
1108   if (v>=2)
1109     printf("Calculate Pvalues as a function of query and template lengths and diversities...\n");
1110   Reset();
1111   while (!End())
1112     {
1113       hit = ReadNext();
1114
1115       if (par.loc)
1116         {
1117           lamda = lamda_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 );
1118           mu    =    mu_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 );
1119 //        if (v>=3 && nhits++<20)
1120 //           printf("hit=%-10.10s Lq=%-4i  Lt=%-4i  Nq=%5.2f  Nt=%5.2f  =>  lamda=%-6.3f  mu=%-6.3f\n",hit.name,q.L,hit.L,q.Neff_HMM,hit.Neff_HMM,lamda,mu);
1121         }
1122       else
1123         {
1124           printf("WARNING: global calibration not yet implemented!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
1125         }
1126       hit.logPval = logPvalue(hit.score,lamda,mu);
1127       hit.Pval    = Pvalue(hit.score,lamda,mu);
1128       hit.Eval=exp(hit.logPval+log(N_searched));
1129 //    hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss;  // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs
1130       // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1131       hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) )/0.45 - fmin(lamda*hit.score_ss,fmax(0.0,0.2*(hit.score-8.0)))/0.45 - 3.0;
1132       hit.score_sort = hit.score_aass;
1133       hit.Probab = Probab(hit);
1134       Overwrite(hit);
1135     }
1136   SortList();
1137   Reset();
1138   return;
1139 }
1140
1141 //////////////////////////////////////////////////////////////////////////////
1142 /**
1143  * @brief  Calculate Pvalues from calibration of  0: query HMM, 1:template HMMs, 2: both
1144  */
1145 void
1146 HitList::GetPvalsFromCalibration(HMM& q)
1147 {
1148   Hit hit;
1149   char warn=0;
1150   if(N_searched==0) N_searched=1;
1151   if (v>=2)
1152     {
1153       switch (par.calm)
1154         {
1155         case 0:
1156           printf("Using lamda=%-5.3f and mu=%-5.2f from calibrated query HMM %s. \n",q.lamda,q.mu,q.name);
1157           printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n");
1158           break;
1159         case 1:
1160           printf("Using score distribution parameters lamda and mu from database HMMs \n");
1161           break;
1162         case 2:
1163           printf("Combining score distribution parameters lamda and mu from query and database HMMs\n");
1164           printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n");
1165           break;
1166         }
1167     }
1168   Reset();
1169   while (!End())
1170     {
1171       hit = ReadNext();
1172       if (par.calm==0 || (hit.logPvalt==0) )
1173         {
1174           hit.logPval = logPvalue(hit.score,q.lamda,q.mu);
1175           hit.Pval    = Pvalue(hit.score,q.lamda,q.mu);
1176           if (par.calm>0 && warn++<1 && v>=1)
1177             printf("Warning: some template HMM (e.g. %s) are not calibrated. Using query calibration.\n",hit.name);
1178         }
1179       else if (par.calm==1)
1180         {
1181           hit.logPval = hit.logPvalt;
1182           hit.Pval    = hit.Pvalt;
1183         }
1184       else if (par.calm==2)
1185         {
1186           hit.logPval = 0.5*( logPvalue(hit.score,q.lamda,q.mu) + hit.logPvalt);
1187           hit.Pval    = sqrt( Pvalue(hit.score,q.lamda,q.mu) * hit.Pvalt);
1188           if (v>=5) printf("Score: %7.1f  lamda: %7.1f  mu: %7.1f  P-values:  query-calibrated: %8.2G   template-calibrated: %8.2G   geometric mean: %8.2G\n",hit.score,q.lamda,q.mu,Pvalue(hit.score,q.lamda,q.mu),hit.Pvalt,hit.Pval);
1189         }
1190
1191       hit.Eval=exp(hit.logPval+log(N_searched));
1192 //    hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss;  // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs
1193       // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1194       hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) ) / 0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0));
1195       hit.score_sort = hit.score_aass;
1196       hit.Probab = Probab(hit);
1197       Overwrite(hit);
1198     }
1199   SortList();
1200   Reset();
1201   return;
1202 }
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212 //////////////////////////////////////////////////////////////////////////////
1213 //////////////////////////////////////////////////////////////////////////////
1214 //////////////////////////////////////////////////////////////////////////////
1215 // Transitive scoring
1216 //////////////////////////////////////////////////////////////////////////////
1217 //////////////////////////////////////////////////////////////////////////////
1218 //////////////////////////////////////////////////////////////////////////////
1219
1220
1221
1222
1223
1224
1225
1226 /////////////////////////////////////////////////////////////////////////////////////
1227 /**
1228  * @brief Calculate P-values and Probabilities from transitive scoring over whole database
1229  */
1230 void
1231 HitList::TransitiveScoring()
1232 {
1233   void PrintMatrix(float** V, int N);
1234   void PrintMatrix(double** V, int N);
1235
1236   float** Z;    // matrix of intra-db Z-scores Z_kl
1237   float** C;    // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
1238   char** fold;  // fold name of HMM k
1239   char** fam;   // family of HMM k
1240   float* Prob;  // probability of HMM k
1241   float* Zq;    // Zq[k] = Z-score between query and database HMM k
1242   float* Ztq;   // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
1243   float* Zrq;   // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
1244   float* w;     // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
1245   int* ll;      // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
1246   int N;        // dimension of weight matrix is NxN
1247   int M;        // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
1248   int k,l,m,n;  // indices for database HMMs
1249   char name[NAMELEN];
1250   Hash<int> index(MAXPROF+7);  // index{name} = index of HMM name in {1,...,N}
1251   index.Null(-1);              // Set int value to return when no data can be retrieved
1252   Hash<int> excluded(13);      // Hash containing names of superfamilies to be excluded from fit
1253   excluded.Null(0);            // Set int value to return when no data can be retrieved
1254   Hit hit;
1255   size_t unused; /* disable fread gcc warning */
1256
1257   // Read weights matrix W with index hash and names array
1258   fprintf(stderr,"Reading in weights file\n");
1259   FILE* wfile = fopen(par.wfile,"rb");
1260   if (v>=1 && wfile==NULL)
1261     {
1262       fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
1263       perror("fopen");
1264       fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1265       par.trans=0;
1266       return;
1267     }
1268   unused = fread(&N,sizeof(int),1,wfile);  // read matrix dimension (i.e. number of HMMs in database)
1269   if (v>=1 && N!=N_searched)
1270     {
1271       fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
1272       fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1273       par.trans=0;
1274       return;
1275     }
1276   if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
1277   // Read names of HMMs (to specify mapping of HMM to matrix indices)
1278   for (k=0; k<N; k++)
1279     {
1280       unused = fread(name,sizeof(char),IDLEN,wfile);
1281       index.Add(name,k);
1282     }
1283   // Read symmetric Z-scores matrix
1284   Z = new(float*[N]);
1285   for (k=0; k<N; k++)
1286     {
1287       Z[k] = new(float[N]);
1288       for (l=0; l<k; l++) Z[k][l] = Z[l][k];
1289       unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
1290     }
1291   // Read symmetric covariance matrix
1292   C = new(float*[N]);
1293   for (k=0; k<N; k++)
1294     {
1295       C[k] = new(float[N]);
1296       for (l=0; l<k; l++) C[k][l] = C[l][k];
1297       unused = fread(C[k]+k,sizeof(float),N-k,wfile);
1298     }
1299   fclose(wfile);
1300
1301   // Allocate memory
1302   Zq = new(float[N]);
1303   Ztq = new(float[N]);
1304   Zrq = new(float[N]);
1305   fold = new(char*[N]);
1306   fam = new(char*[N]);
1307   Prob = new(float[N]);
1308   ll = new(int[N]);
1309   w = new(float[N]);
1310
1311   // Transform P-values to normally distributed Z-scores and store in Zq vector
1312   fprintf(stderr,"Transform P-values to Z-scores\n");
1313   float Zmax_neg   = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
1314   float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
1315   printf("Zmax = %6.2f   Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
1316
1317   Reset();
1318   while (!End())
1319     {
1320       hit = ReadNext();
1321       if (hit.irep>1) continue;
1322       k = index.Show(hit.name);
1323       if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
1324       if (hit.logPvalt<0)
1325         Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt));  // Zq[k] = 0.5*(Zkq + Zqk)
1326       else
1327         Zq[k] = Score2Z(fabs(hit.logPval));                           // Zq[k] = Zqk
1328 //      printf("%4i  %-10.10s logPvalt=%9g  Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
1329 //       if (isnan(Zq[k])) {
1330 //      fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1331 //      printf("%4i  %-10.10s logPval=%9g  logPvalt=%9g  Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
1332 //      par.trans=0;
1333 //      return;
1334 //       }
1335       if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
1336       fold[k] = new(char[IDLEN]);
1337       fam[k] = new(char[IDLEN]);
1338       strcpy(fold[k],hit.fold);
1339       strcpy(fam[k],hit.fam);
1340       weight[k] = hit.weight;
1341       Prob[k] = hit.Probab;
1342    }
1343
1344   if (v>=3)
1345     {
1346       excluded.Reset();
1347       while (!excluded.End())
1348         {
1349           excluded.ReadNext(name);
1350           printf("Excluded fold %s from fitting to Ztq\n",name);
1351         }
1352     }
1353
1354
1355   ////////////////////////////////////////////////////////////////
1356   // Calculate transitive score (query->l) Zt[l]
1357
1358   // Construct vector ll of indices l for which Z_lq > Zmin_trans
1359   m = 0;
1360   for (l=0; l<N; l++)
1361     if (Zq[l]>=Zmin_trans) ll[m++]=l;
1362   M = m;  // number of indices l for which Z_lq,Z_lk > Zmin_trans
1363
1364 //   for (m=0; m<M; m++)
1365 //     fprintf(stderr,"m=%-4i l=%-4i  %-10.10s  Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
1366
1367   if (M<=1)
1368     for (k=0; k<N; k++) Ztq[k]=0.0;
1369   else
1370     {
1371       // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1372       double** Csub = new(double*[M]);
1373       double** Cinv = new(double*[M]);
1374       for (m=0; m<M; m++)
1375         {
1376           Csub[m] = new(double[M]);
1377           Cinv[m] = new(double[M]);
1378           for (n=0; n<M; n++)
1379             Csub[m][n] = double(C[ll[m]][ll[n]]);
1380         }
1381
1382       if (v>=3)
1383         {
1384           fprintf(stderr,"Covariance matrix\n");
1385           PrintMatrix(Csub,M);
1386         }
1387
1388       // Invert Csub
1389       fprintf(stderr,"Calculate inverse of covariance submatrix\n");
1390       InvertMatrix(Cinv,Csub,M);
1391
1392       if (v>=3)
1393         {
1394           fprintf(stderr,"Inverse covariance matrix\n");
1395           PrintMatrix(Cinv,M);
1396         }
1397
1398       // Calculate weights w[l]
1399       for (m=0; m<M; m++)
1400         {
1401           double sum = 0.0;
1402           for (n=0; n<M; n++)
1403             sum += 1.0 * Cinv[m][n];
1404           w[m] = fmax(sum,0.0);
1405         }
1406       for (l=0; l<M; l++){
1407         delete[](Cinv[l]); (Cinv[l]) = NULL;
1408       }
1409       delete[](Cinv); (Cinv) = NULL;
1410
1411       // Calculate Ztq[k] for all HMMs k
1412       fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
1413       float norm = NormalizationFactor(Csub,w,M);
1414       for (k=0; k<N; k++)
1415         {
1416           double sumZ = 0.0;
1417           for (m=0; m<M; m++)
1418             sumZ += w[m] * Z[ll[m]][k];
1419           Ztq[k] = sumZ/norm;
1420         }
1421
1422       for (l=0; l<M; l++){
1423         delete[](Csub[l]); (Csub[l]) = NULL;
1424       }
1425       delete[](Csub); (Csub) = NULL;
1426     }
1427
1428   ////////////////////////////////////////////////////////////////
1429   // Calculate reverse transitive score (l->query-) Zrq[l]
1430
1431   fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
1432   for (k=0; k<N; k++)
1433     {
1434       // Construct vector ll of indices l for which Z_lk > Zmin_tran
1435       m = 0;
1436       for (l=0; l<N; l++)
1437           if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l;
1438       int M = m;  // number of indices l for which Z_lq,Z_lk > Zmin_tran
1439
1440
1441 //    fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
1442 //    for (m=0; m<M; m++)
1443 //    printf(stderr,"m=%-4i k=%-4i  l=%-4i  %-10.10s  Zq[l]=%7f  Z_lk=%7f  \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
1444
1445       if (M<=1)
1446         {
1447           Zrq[k] = Zq[k];
1448         }
1449       else
1450         {
1451           // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1452           double** Csub = new(double*[M]);
1453           for (m=0; m<M; m++)
1454             {
1455               Csub[m] = new(double[M]);
1456               for (n=0; n<M; n++)
1457                 Csub[m][n] = double(C[ll[m]][ll[n]]);
1458             }
1459 //        fprintf(stderr,"Covariance matrix\n");
1460 //        PrintMatrix(Csub,M);
1461
1462           if (M==2)
1463             {
1464               for (m=0; m<M; m++) w[m] = 1.0/M;
1465             }
1466           else
1467             {
1468
1469               double** Cinv = new(double*[M]);
1470               for (m=0; m<M; m++) Cinv[m] = new(double[M]);
1471
1472               // Invert Csub
1473               InvertMatrix(Cinv,Csub,M);
1474
1475               //         fprintf(stderr,"Inverse covariance matrix\n");
1476               //         PrintMatrix(Cinv,M);
1477
1478               // Calculate weights w[l]
1479               for (m=0; m<M; m++)
1480                 {
1481                   double sum = 0.0;
1482                   for (n=0; n<M; n++)
1483                     sum += 1.0 * Cinv[m][n];
1484                   w[m] = fmax(sum,0.0);
1485                 }
1486
1487 //            for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
1488
1489               for (l=0; l<M; l++){
1490                 delete[](Cinv[l]); (Cinv[l]) = NULL;
1491               }
1492               delete[](Cinv); (Cinv) = NULL;
1493             }
1494
1495           // Calculate Zrq[k] and normalize
1496           float norm = NormalizationFactor(Csub,w,M);
1497           double sumZ = 0.0;
1498           for (m=0; m<M; m++)
1499             sumZ += w[m] * Zq[ll[m]];
1500           Zrq[k] = sumZ/norm;
1501
1502           for (l=0; l<M; l++){
1503             delete[](Csub[l]); (Csub[l]) = NULL;
1504           }
1505           delete[](Csub); (Csub) = NULL;
1506         }
1507
1508 //    fprintf(stderr,"\nZq[k]=%8.2g  Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
1509     }
1510
1511   // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
1512   for (k=0; k<N; k++)
1513     {
1514       float Zqtot =  Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
1515 //       if (isnan(Zqtot))
1516 //      {
1517 //        fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1518 //        printf("%4i  %-10.10s Zq=%6.2f  Ztq=%6.2f  Zrq=%6.2f  Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1519 //        par.trans=0;
1520 //        return;
1521 //      }
1522       if (v>=2 &&  Zq[k] + Zqtot > 2*Zmin_trans) {
1523         printf("%4i  %-10.10s Zq=%6.2f  Ztq=%6.2f  Zrq=%6.2f  -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1524       }
1525       Ztq[k] = Zqtot;
1526     }
1527
1528   // Calculate mean and standard deviation of Z1q
1529   fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
1530   double sumw=0.0;
1531   double sumZ=0.0;
1532   double sumZ2=0.0;
1533   for (k=0; k<N; k++)
1534     {
1535       if (excluded.Contains(fold[k])) continue;
1536       sumw  += weight[k];
1537       sumZ  += weight[k]*Ztq[k];
1538       sumZ2 += weight[k]*Ztq[k]*Ztq[k];
1539 //       if (isnan(sumZ))
1540 //      {
1541 //        fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1542 //        printf("%4i  %-10.10s Zq=%9f  Zrq=%9f  Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
1543 //        par.trans=0;
1544 //        return;
1545 //      }
1546     }
1547   float mu = sumZ/sumw;
1548   float sigma = sqrt(sumZ2/sumw-mu*mu);
1549   if (v>=2) printf("mu(Ztq)=%6.3f  sigma(Ztq)=%6.2f\n",mu,sigma);
1550   sigma *= 1.01;// correct different fitting of EVD and normal variables
1551
1552   // Normalize Ztq and calculate P1-values
1553   fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
1554   Reset();
1555   while (!End())
1556     {
1557       hit = ReadNext();
1558       hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
1559       hit.E1val = N_searched*(hit.logPval<-100.0? 0.0 : exp(hit.logPval));
1560       // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1561       hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
1562       hit.Probab = Probab(hit);
1563       hit.score_sort = hit.logPval;
1564       Overwrite(hit);                                        // copy hit object into current position of hitlist
1565    }
1566
1567   for (k=0; k<N; k++){
1568     delete[](Z[k]); (Z[k]) = NULL;
1569   }
1570   for (k=0; k<N; k++){
1571     delete[](C[k]); (C[k]) = NULL;
1572   }
1573   for (k=0; k<N; k++){
1574     delete[](fold[k]); (fold[k]) = NULL;
1575   }
1576   for (k=0; k<N; k++){
1577     delete[](fam[k]); (fam[k]) = NULL;
1578   }
1579   delete[](C); (C) = NULL;
1580   delete[](Z); (Z) = NULL;
1581   delete[](fold); (fold) = NULL;
1582   delete[](fam); (fam) = NULL;
1583   delete[](Prob); (Prob) = NULL;
1584   delete[](ll); (ll) = NULL;
1585   delete[](Zq); (Zq) = NULL;
1586   delete[](Ztq); (Ztq) = NULL;
1587 }
1588
1589
1590
1591 //////////////////////////////////////////////////////////////////////////////
1592 /**
1593  * @brief Calculate P-values and Probabilities from transitive scoring over whole database
1594  */
1595 void
1596 HitList::TransitiveScoring2()
1597 {
1598   void PrintMatrix(float** V, int N);
1599   void PrintMatrix(double** V, int N);
1600
1601   float** Z;    // matrix of intra-db Z-scores Z_kl
1602   float** C;    // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
1603   char** fold;  // fold name of HMM k
1604   char** fam;   // family of HMM k
1605   float* Prob;  // probability of HMM k
1606   float* Zq;    // Zq[k] = Z-score between query and database HMM k
1607   float* Ztq;   // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
1608   float* Zrq;   // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
1609   float* w;     // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
1610   int* ll;      // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
1611   int N;        // dimension of weight matrix is NxN
1612   int M;        // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
1613   int k,l,m,n;  // indices for database HMMs
1614   char name[NAMELEN];
1615   Hash<int> index(MAXPROF+7);  // index{name} = index of HMM name in {1,...,N}
1616   index.Null(-1);              // Set int value to return when no data can be retrieved
1617   Hash<int> excluded(13);      // Hash containing names of superfamilies to be excluded from fit
1618   excluded.Null(0);            // Set int value to return when no data can be retrieved
1619   Hit hit;
1620   size_t unused; /* disable fread gcc warning */
1621
1622   // Read weights matrix W with index hash and names array
1623   fprintf(stderr,"Reading in weights file\n");
1624   FILE* wfile = fopen(par.wfile,"rb");
1625   if (v>=1 && wfile==NULL)
1626     {
1627       fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
1628       perror("fopen");
1629       fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1630       par.trans=0;
1631       return;
1632     }
1633   unused = fread(&N,sizeof(int),1,wfile);  // read matrix dimension (i.e. number of HMMs in database)
1634   if (v>=1 && N!=N_searched)
1635     {
1636       fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
1637       fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1638       par.trans=0;
1639       return;
1640     }
1641   if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
1642   // Read names of HMMs (to specify mapping of HMM to matrix indices)
1643   for (k=0; k<N; k++)
1644     {
1645       unused = fread(name,sizeof(char),IDLEN,wfile);
1646       index.Add(name,k);
1647     }
1648   // Read symmetric Z-scores matrix
1649   Z = new(float*[N]);
1650   for (k=0; k<N; k++)
1651     {
1652       Z[k] = new(float[N]);
1653       for (l=0; l<k; l++) Z[k][l] = Z[l][k];
1654       unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
1655     }
1656   // Read symmetric covariance matrix
1657   C = new(float*[N]);
1658   for (k=0; k<N; k++)
1659     {
1660       C[k] = new(float[N]);
1661       for (l=0; l<k; l++) C[k][l] = C[l][k];
1662       unused = fread(C[k]+k,sizeof(float),N-k,wfile);
1663     }
1664   fclose(wfile);
1665
1666   // Allocate memory
1667   Zq = new(float[N]);
1668   Ztq = new(float[N]);
1669   Zrq = new(float[N]);
1670   fold = new(char*[N]);
1671   fam = new(char*[N]);
1672   Prob = new(float[N]);
1673   ll = new(int[N]);
1674   w = new(float[N]);
1675
1676   // Transform P-values to normally distributed Z-scores and store in Zq vector
1677   fprintf(stderr,"Transform P-values to Z-scores\n");
1678   float Zmax_neg   = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
1679   float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
1680   printf("Zmax = %6.2f   Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
1681
1682   Reset();
1683   while (!End())
1684     {
1685       hit = ReadNext();
1686       if (hit.irep>1) continue;
1687       k = index.Show(hit.name);
1688       if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
1689       if (hit.logPvalt<0)
1690         Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt));  // Zq[k] = 0.5*(Zkq + Zqk)
1691       else
1692         Zq[k] = Score2Z(fabs(hit.logPval));                           // Zq[k] = Zqk
1693 //      printf("%4i  %-10.10s logPvalt=%9g  Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
1694 //      if (isnan(Zq[k]))
1695 //       {
1696 //      fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1697 //      printf("%4i  %-10.10s logPval=%9g  logPvalt=%9g  Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
1698 //      par.trans=0;
1699 //      return;
1700 //       }
1701       if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
1702       fold[k] = new(char[IDLEN]);
1703       fam[k] = new(char[IDLEN]);
1704       strcpy(fold[k],hit.fold);
1705       strcpy(fam[k],hit.fam);
1706       weight[k] = hit.weight;
1707       Prob[k] = hit.Probab;
1708    }
1709
1710   if (v>=3)
1711     {
1712       excluded.Reset();
1713       while (!excluded.End())
1714         {
1715           excluded.ReadNext(name);
1716           printf("Excluded fold %s from fitting to Ztq\n",name);
1717         }
1718     }
1719
1720
1721   ////////////////////////////////////////////////////////////////
1722   // Calculate transitive score (query->l) Zt[l]
1723
1724   // Construct vector ll of indices l for which Z_lq > Zmin_trans
1725   m = 0;
1726   for (l=0; l<N; l++)
1727     if (Zq[l]>=Zmin_trans) ll[m++]=l;
1728   M = m;  // number of indices l for which Z_lq,Z_lk > Zmin_trans
1729
1730 //   for (m=0; m<M; m++)
1731 //     fprintf(stderr,"m=%-4i l=%-4i  %-10.10s  Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
1732
1733   if (M<=1)
1734     for (k=0; k<N; k++) Ztq[k]=0.0;
1735   else
1736     {
1737       // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1738       double** Csub = new(double*[M]);
1739       double** Cinv = new(double*[M]);
1740       for (m=0; m<M; m++)
1741         {
1742           Csub[m] = new(double[M]);
1743           Cinv[m] = new(double[M]);
1744           for (n=0; n<M; n++)
1745             Csub[m][n] = double(C[ll[m]][ll[n]]);
1746         }
1747
1748       if (v>=3)
1749         {
1750           fprintf(stderr,"Covariance matrix\n");
1751           PrintMatrix(Csub,M);
1752         }
1753
1754 //       // Invert Csub
1755 //       fprintf(stderr,"Calculate inverse of covariance submatrix\n");
1756 //       InvertMatrix(Cinv,Csub,M);
1757
1758 //       if (v>=3)
1759 //      {
1760 //        fprintf(stderr,"Inverse covariance matrix\n");
1761 //        PrintMatrix(Cinv,M);
1762 //      }
1763
1764
1765       // Calculate weights w[l]
1766       for (m=0; m<M; m++)
1767         {
1768           double sum = 0.0;
1769           for (n=0; n<M; n++)
1770             sum += 1.0 * Csub[m][n];
1771           printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum);
1772           w[m] = (sum>0? Zq[ll[m]] / sum : 0.0);
1773         }
1774       for (l=0; l<M; l++){
1775         delete[](Cinv[l]); (Cinv[l]) = NULL;
1776       }
1777       delete[](Cinv); (Cinv) = NULL;
1778
1779       // Calculate Ztq[k] for all HMMs k
1780       fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
1781       float norm = NormalizationFactor(Csub,w,M);
1782       for (k=0; k<N; k++)
1783         {
1784           double sumZ = 0.0;
1785           for (m=0; m<M; m++)
1786             sumZ += w[m] * Z[ll[m]][k];
1787           Ztq[k] = sumZ/norm;
1788         }
1789
1790       for (l=0; l<M; l++){
1791         delete[](Csub[l]); (Csub[l]) = NULL;
1792       }
1793       delete[](Csub); (Csub) = NULL;
1794     }
1795
1796   ////////////////////////////////////////////////////////////////
1797   // Calculate reverse transitive score (l->query-) Zrq[l]
1798
1799   fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
1800   for (k=0; k<N; k++)
1801     {
1802       // Construct vector ll of indices l for which Z_lk > Zmin_tran
1803       m = 0;
1804       for (l=0; l<N; l++)
1805           if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l;
1806       int M = m;  // number of indices l for which Z_lq,Z_lk > Zmin_tran
1807
1808
1809 //    fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
1810 //    for (m=0; m<M; m++)
1811 //    printf(stderr,"m=%-4i k=%-4i  l=%-4i  %-10.10s  Zq[l]=%7f  Z_lk=%7f  \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
1812
1813       if (M<=1)
1814         {
1815           Zrq[k] = Zq[k];
1816         }
1817       else
1818         {
1819           // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1820           double** Csub = new(double*[M]);
1821           for (m=0; m<M; m++)
1822             {
1823               Csub[m] = new(double[M]);
1824               for (n=0; n<M; n++)
1825                 Csub[m][n] = double(C[ll[m]][ll[n]]);
1826             }
1827 //        fprintf(stderr,"Covariance matrix\n");
1828 //        PrintMatrix(Csub,M);
1829
1830           if (M<=2)
1831             {
1832               for (m=0; m<M; m++) w[m] = 1.0/M;
1833             }
1834           else
1835             {
1836
1837               double** Cinv = new(double*[M]);
1838               for (m=0; m<M; m++) Cinv[m] = new(double[M]);
1839
1840 //            // Invert Csub
1841 //            InvertMatrix(Cinv,Csub,M);
1842
1843 // //         fprintf(stderr,"Inverse covariance matrix\n");
1844 // //         PrintMatrix(Cinv,M);
1845
1846               // Calculate weights w[l]
1847               for (m=0; m<M; m++)
1848                 {
1849                   double sum = 0.0;
1850                   for (n=0; n<M; n++)
1851                     sum += 1.0 * Csub[m][n];
1852                   w[m] = (sum>0? Z[ll[m]][k] / sum : 0.0);
1853                 }
1854
1855 //            for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
1856
1857               for (l=0; l<M; l++){
1858                 delete[](Cinv[l]); (Cinv[l]) = NULL;
1859               }
1860               delete[](Cinv); (Cinv) = NULL;
1861             }
1862
1863           // Calculate Zrq[k] and normalize
1864           float norm = NormalizationFactor(Csub,w,M);
1865           double sumZ = 0.0;
1866           for (m=0; m<M; m++)
1867             sumZ += w[m] * Zq[ll[m]];
1868           Zrq[k] = sumZ/norm;
1869
1870           for (l=0; l<M; l++){
1871             delete[](Csub[l]); (Csub[l]) = NULL;
1872           }
1873           delete[](Csub); (Csub) = NULL;
1874         }
1875
1876 //    fprintf(stderr,"\nZq[k]=%8.2g  Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
1877     }
1878
1879   // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
1880   for (k=0; k<N; k++)
1881     {
1882       float Zqtot =  Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
1883 //        if (isnan(Zqtot))
1884 //      {
1885 //        fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1886 //        printf("%4i  %-10.10s Zq=%6.2f  Ztq=%6.2f  Zrq=%6.2f  Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1887 //        par.trans=0;
1888 //        return;
1889 //      }
1890       if (v>=2 &&  Zq[k] + Zqtot > 2*Zmin_trans) {
1891         printf("%4i  %-10.10s Zq=%6.2f  Ztq=%6.2f  Zrq=%6.2f  -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1892       }
1893       Ztq[k] = Zqtot;
1894     }
1895
1896   // Calculate mean and standard deviation of Z1q
1897   fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
1898   double sumw=0.0;
1899   double sumZ=0.0;
1900   double sumZ2=0.0;
1901   for (k=0; k<N; k++)
1902     {
1903       if (excluded.Contains(fold[k])) continue;
1904       sumw  += weight[k];
1905       sumZ  += weight[k]*Ztq[k];
1906       sumZ2 += weight[k]*Ztq[k]*Ztq[k];
1907 //       if (isnan(sumZ))
1908 //      {
1909 //        fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1910 //        printf("%4i  %-10.10s Zq=%9f  Zrq=%9f  Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
1911 //        par.trans=0;
1912 //        return;
1913 //      }
1914     }
1915   float mu = sumZ/sumw;
1916   float sigma = sqrt(sumZ2/sumw-mu*mu);
1917   if (v>=2) printf("mu(Ztq)=%6.3f  sigma(Ztq)=%6.2f\n",mu,sigma);
1918   sigma *= 1.01;// correct different fitting of EVD and normal variables
1919
1920   // Normalize Ztq and calculate P1-values
1921   fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
1922   Reset();
1923   while (!End())
1924     {
1925       hit = ReadNext();
1926       hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
1927       hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));
1928       // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1929       hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
1930       hit.Probab = Probab(hit);
1931       hit.score_sort = hit.logPval;
1932       Overwrite(hit);                                        // copy hit object into current position of hitlist
1933    }
1934
1935   for (k=0; k<N; k++){
1936     delete[](Z[k]); (Z[k]) = NULL;
1937   }
1938   for (k=0; k<N; k++){
1939     delete[](C[k]); (C[k]) = NULL;
1940   }
1941   for (k=0; k<N; k++){
1942     delete[](fold[k]); (fold[k]) = NULL;
1943   }
1944   for (k=0; k<N; k++){
1945     delete[](fam[k]); (fam[k]) = NULL;
1946   }
1947   delete[](C); (C) = NULL;
1948   delete[](Z); (Z) = NULL;
1949   delete[](fold); (fold) = NULL;
1950   delete[](fam); (fam) = NULL;
1951   delete[](Prob); (Prob) = NULL;
1952   delete[](ll); (ll) = NULL;
1953   delete[](Zq); (Zq) = NULL;
1954   delete[](Ztq); (Ztq) = NULL;
1955 }
1956
1957
1958 /////////////////////////////////////////////////////////////////////////////////////
1959 /**
1960  * @brief Calculate P-values and Probabilities from transitive scoring over whole database
1961  * Like TransitiveScoring(),
1962  * but in transitive scoring, Z1_qk = sum_l w_l*Z_lk,  use all  l:E_ql<=E_qk
1963  * and in reverse    scoring, Z1_kr = sum_l w_l*Z_lq,  use all  l:E_kl<=E_kq
1964  */
1965 void
1966 HitList::TransitiveScoring3()
1967 {
1968   void PrintMatrix(float** V, int N);
1969   void PrintMatrix(double** V, int N);
1970
1971   float** Z;    // matrix of intra-db Z-scores Z_kl
1972   float** C;    // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
1973   char** fold;  // fold name of HMM k
1974   char** fam;   // family of HMM k
1975   float* Prob;  // probability of HMM k
1976   float* Zq;    // Zq[k] = Z-score between query and database HMM k
1977   float* Ztq;   // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
1978   float* Zrq;   // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
1979   float* w;     // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
1980   int* ll;      // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
1981   int N;        // dimension of weight matrix is NxN
1982   int M;        // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
1983   int k,l,m,n;  // indices for database HMMs
1984   char name[NAMELEN];
1985   Hash<int> index(MAXPROF+7);  // index{name} = index of HMM name in {1,...,N}
1986   index.Null(-1);              // Set int value to return when no data can be retrieved
1987   Hash<int> excluded(13);      // Hash containing names of superfamilies to be excluded from fit
1988   excluded.Null(0);            // Set int value to return when no data can be retrieved
1989   Hit hit;
1990   size_t unused; /* disable fread gcc warning */
1991
1992   // Read weights matrix W with index hash and names array
1993   fprintf(stderr,"Reading in weights file\n");
1994   FILE* wfile = fopen(par.wfile,"rb");
1995   if (v>=1 && wfile==NULL)
1996     {
1997       fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
1998       perror("fopen");
1999       fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2000       par.trans=0;
2001       return;
2002     }
2003   unused = fread(&N,sizeof(int),1,wfile);  // read matrix dimension (i.e. number of HMMs in database)
2004   if (v>=1 && N!=N_searched)
2005     {
2006       fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
2007       fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2008       par.trans=0;
2009       return;
2010     }
2011   if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
2012   // Read names of HMMs (to specify mapping of HMM to matrix indices)
2013   for (k=0; k<N; k++)
2014     {
2015       unused = fread(name,sizeof(char),IDLEN,wfile);
2016       index.Add(name,k);
2017     }
2018   // Read symmetric Z-scores matrix
2019   Z = new(float*[N]);
2020   for (k=0; k<N; k++)
2021     {
2022       Z[k] = new(float[N]);
2023       for (l=0; l<k; l++) Z[k][l] = Z[l][k];
2024       unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
2025     }
2026   // Read symmetric covariance matrix
2027   C = new(float*[N]);
2028   for (k=0; k<N; k++)
2029     {
2030       C[k] = new(float[N]);
2031       for (l=0; l<k; l++) C[k][l] = C[l][k];
2032       unused = fread(C[k]+k,sizeof(float),N-k,wfile);
2033     }
2034   fclose(wfile);
2035
2036   // Allocate memory
2037   Zq = new(float[N]);
2038   Ztq = new(float[N]);
2039   Zrq = new(float[N]);
2040   fold = new(char*[N]);
2041   fam = new(char*[N]);
2042   Prob = new(float[N]);
2043   ll = new(int[N]);
2044   w = new(float[N]);
2045
2046   // Transform P-values to normally distributed Z-scores and store in Zq vector
2047   fprintf(stderr,"Transform P-values to Z-scores\n");
2048   float Zmax_neg   = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
2049   float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
2050   printf("Zmax = %6.2f   Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
2051
2052   Reset();
2053   while (!End())
2054     {
2055       hit = ReadNext();
2056       if (hit.irep>1) continue;
2057       k = index.Show(hit.name);
2058       if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
2059       if (hit.logPvalt<0)
2060         Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt));  // Zq[k] = 0.5*(Zkq + Zqk)
2061       else
2062         Zq[k] = Score2Z(fabs(hit.logPval));                           // Zq[k] = Zqk
2063 //      printf("%4i  %-10.10s logPvalt=%9g  Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
2064 //       if (isnan(Zq[k]))
2065 //      {
2066 //        fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2067 //        printf("%4i  %-10.10s logPval=%9g  logPvalt=%9g  Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
2068 //        par.trans=0;
2069 //        return;
2070 //      }
2071       if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
2072       fold[k] = new(char[IDLEN]);
2073       fam[k] = new(char[IDLEN]);
2074       strcpy(fold[k],hit.fold);
2075       strcpy(fam[k],hit.fam);
2076       weight[k] = hit.weight;
2077       Prob[k] = hit.Probab;
2078    }
2079
2080   if (v>=3)
2081     {
2082       excluded.Reset();
2083       while (!excluded.End())
2084         {
2085           excluded.ReadNext(name);
2086           printf("Excluded fold %s from fitting to Ztq\n",name);
2087         }
2088     }
2089
2090
2091   ////////////////////////////////////////////////////////////////
2092   // Calculate transitive score (query->l) Ztq[l]
2093
2094   fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
2095   for (k=0; k<N; k++)
2096     {
2097       // Construct vector ll of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans)
2098       float Zmink = fmax(Zq[k],Zmin_trans);
2099       for (m=l=0; l<N; l++)
2100         if (Zq[l]>=Zmink) ll[m++]=l;
2101       M = m;  // number of indices l for which  Z_lq OR Z_lk >= max(Z_kq,Zmin_trans)
2102
2103 //    for (m=0; m<M; m++)
2104 //    fprintf(stderr,"m=%-4i l=%-4i  %-10.10s  Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
2105
2106       if (M<=1)
2107         {
2108           Ztq[k]=Zq[k];
2109         }
2110       else
2111         {
2112           // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2113           double** Csub = new(double*[M]);
2114           double** Cinv = new(double*[M]);
2115           for (m=0; m<M; m++)
2116             {
2117               Csub[m] = new(double[M]);
2118               Cinv[m] = new(double[M]);
2119               for (n=0; n<M; n++)
2120                 Csub[m][n] = double(C[ll[m]][ll[n]]);
2121             }
2122
2123 //        fprintf(stderr,"Covariance matrix\n");
2124 //        PrintMatrix(Csub,M);
2125
2126           // Invert Csub
2127 //        fprintf(stderr,"Calculate inverse of covariance submatrix\n");
2128           InvertMatrix(Cinv,Csub,M);
2129
2130 //        fprintf(stderr,"Inverse covariance matrix\n");
2131 //        PrintMatrix(Cinv,M);
2132
2133           // Calculate weights w[l]
2134           for (m=0; m<M; m++)
2135             {
2136               double sum = 0.0;
2137               for (n=0; n<M; n++)
2138                 sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq !
2139               w[m] = fmax(sum,0.0);
2140             }
2141           for (l=0; l<M; l++){
2142             delete[](Cinv[l]); (Cinv[l]) = NULL;
2143           }
2144           delete[](Cinv); (Cinv) = NULL;
2145
2146           // Calculate Ztq[k]
2147           float norm = NormalizationFactor(Csub,w,M);
2148           double sumZ = 0.0;
2149           for (m=0; m<M; m++)
2150             sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2151 //          sumZ += w[m] * Z[ll[m]][k];
2152           Ztq[k] = sumZ/norm;
2153
2154           for (l=0; l<M; l++){
2155             delete[](Csub[l]); (Csub[l]) = NULL;
2156           }
2157           delete[](Csub); (Csub) = NULL;
2158         }
2159     }
2160
2161   ////////////////////////////////////////////////////////////////
2162   // Calculate reverse transitive score (l->query-) Zrq[l]
2163
2164   fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
2165   for (k=0; k<N; k++)
2166     {
2167       // Construct vector ll of indices l for which Z_lk > Zmin_tran
2168       float Zmink = fmax(Zq[k],Zmin_trans);
2169       for (m=l=0; l<N; l++)
2170         if (Z[l][k]>=Zmink) ll[m++]=l;
2171       int M = m;  // number of indices l for which Z_lq,Z_lk > Zmin_tran
2172
2173
2174 //    fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
2175 //    for (m=0; m<M; m++)
2176 //    printf(stderr,"m=%-4i k=%-4i  l=%-4i  %-10.10s  Zq[l]=%7f  Z_lk=%7f  \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
2177
2178      if (M<=1)
2179         {
2180           Zrq[k] = Zq[k];
2181         }
2182       else
2183         {
2184           // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2185           double** Csub = new(double*[M]);
2186           for (m=0; m<M; m++)
2187             {
2188               Csub[m] = new(double[M]);
2189               for (n=0; n<M; n++)
2190                 Csub[m][n] = double(C[ll[m]][ll[n]]);
2191             }
2192 //        fprintf(stderr,"Covariance matrix\n");
2193 //        PrintMatrix(Csub,M);
2194
2195           if (M==2)
2196             {
2197               for (m=0; m<M; m++) w[m] = 1.0/M;
2198             }
2199           else
2200             {
2201
2202               double** Cinv = new(double*[M]);
2203               for (m=0; m<M; m++) Cinv[m] = new(double[M]);
2204
2205               // Invert Csub
2206               InvertMatrix(Cinv,Csub,M);
2207
2208               //         fprintf(stderr,"Inverse covariance matrix\n");
2209               //         PrintMatrix(Cinv,M);
2210
2211               // Calculate weights w[l]
2212               for (m=0; m<M; m++)
2213                 {
2214                   double sum = 0.0;
2215                   for (n=0; n<M; n++)
2216                     sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq !
2217                   w[m] = fmax(sum,0.0);
2218                 }
2219 //            for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
2220               for (l=0; l<M; l++){
2221                 delete[](Cinv[l]); (Cinv[l]) = NULL;
2222               }
2223               delete[](Cinv); (Cinv) = NULL;
2224             }
2225
2226           // Calculate Zrq[k] and normalize
2227           float norm = NormalizationFactor(Csub,w,M);
2228           double sumZ = 0.0;
2229           for (m=0; m<M; m++)
2230             sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2231 //          sumZ += w[m] * Zq[ll[m]];
2232           Zrq[k] = sumZ/norm;
2233
2234           for (l=0; l<M; l++){
2235             delete[](Csub[l]); (Csub[l]) = NULL;
2236           }
2237           delete[](Csub); (Csub) = NULL;
2238         }
2239
2240 //    fprintf(stderr,"\nZq[k]=%8.2g  Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
2241     }
2242
2243   // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
2244   for (k=0; k<N; k++)
2245     {
2246
2247       float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
2248 //       if (isnan(Zqtot))
2249 //      {
2250 //        fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2251 //        printf("%4i  %-10.10s Zq=%6.2f  Ztq=%6.2f  Zrq=%6.2f  ->  Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2252 //        par.trans=0;
2253 //        return;
2254 //      }
2255       if (v>=3 && Zqtot > 2*Zmin_trans) {
2256         printf("%4i  %-10.10s Zq=%6.2f  Ztq=%6.2f  Zrq=%6.2f  ->  Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2257       }
2258       Ztq[k] = Zqtot;
2259     }
2260
2261   // Calculate mean and standard deviation of Z1q
2262   fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
2263   double sumw=0.0;
2264   double sumZ=0.0;
2265   double sumZ2=0.0;
2266   for (k=0; k<N; k++)
2267     {
2268       if (excluded.Contains(fold[k])) continue;
2269       sumw  += weight[k];
2270       sumZ  += weight[k]*Ztq[k];
2271       sumZ2 += weight[k]*Ztq[k]*Ztq[k];
2272 //       if (isnan(sumZ))
2273 //      {
2274 //        fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2275 //        printf("%4i  %-10.10s Zq=%9f  Zrq=%9f  Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
2276 //        par.trans=0;
2277 //        return;
2278 //      }
2279     }
2280   float mu = sumZ/sumw;
2281   float sigma = sqrt(sumZ2/sumw-mu*mu);
2282   if (v>=2) printf("mu(Ztq)=%6.3f  sigma(Ztq)=%6.2f\n",mu,sigma);
2283   sigma *= 1.01;// correct different fitting of EVD and normal variables
2284
2285   // Normalize Ztq and calculate P1-values
2286   fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
2287   Reset();
2288   while (!End())
2289     {
2290       hit = ReadNext();
2291       hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
2292       hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));
2293       // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
2294       hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
2295       hit.Probab = Probab(hit);
2296       hit.score_sort = hit.logPval;
2297       Overwrite(hit);                                        // copy hit object into current position of hitlist
2298    }
2299
2300   for (k=0; k<N; k++){
2301     delete[](Z[k]); (Z[k]) = NULL;
2302   }
2303   for (k=0; k<N; k++){
2304     delete[](C[k]); (C[k]) = NULL;
2305   }
2306   for (k=0; k<N; k++){
2307     delete[](fold[k]); (fold[k]) = NULL;
2308   }
2309   for (k=0; k<N; k++){
2310     delete[](fam[k]); (fam[k]) = NULL;
2311   }
2312   delete[](C); (C) = NULL;
2313   delete[](Z); (Z) = NULL;
2314   delete[](fold); (fold) = NULL;
2315   delete[](fam); (fam) = NULL;
2316   delete[](Prob); (Prob) = NULL;
2317   delete[](ll); (ll) = NULL;
2318   delete[](Zq); (Zq) = NULL;
2319   delete[](Ztq); (Ztq) = NULL;
2320
2321 }
2322
2323
2324 /////////////////////////////////////////////////////////////////////////////////////
2325 /**
2326  * @brief  Calculate P-values and Probabilities from transitive scoring over whole database
2327  * Best tested scheme. Use fmin(Zq[ll[m]],Z[ll[m]][k])
2328  * and fast approximation for weights (not inverse covariance matrix)
2329  */
2330 void
2331 HitList::TransitiveScoring4()
2332 {
2333   void PrintMatrix(float** V, int N);
2334   void PrintMatrix(double** V, int N);
2335
2336   float** Z;    // matrix of intra-db Z-scores Z_kl
2337   float** C;    // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
2338   char** fold;  // fold name of HMM k
2339   char** fam;   // family of HMM k
2340   float* Prob;  // probability of HMM k
2341   float* Zq;    // Zq[k] = Z-score between query and database HMM k
2342   float* Ztq;   // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
2343   float* Zrq;   // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
2344   float* w;     // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
2345   int* ll;      // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
2346   int N;        // dimension of weight matrix is NxN
2347   int M;        // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
2348   int k,l,m,n;  // indices for database HMMs
2349   char name[NAMELEN];
2350   Hash<int> index(MAXPROF+7);  // index{name} = index of HMM name in {1,...,N}
2351   index.Null(-1);              // Set int value to return when no data can be retrieved
2352   Hash<int> excluded(13);      // Hash containing names of superfamilies to be excluded from fit
2353   excluded.Null(0);            // Set int value to return when no data can be retrieved
2354   Hit hit;
2355   size_t unused; /* disable fread gcc warning */
2356
2357   // Read weights matrix W with index hash and names array
2358   fprintf(stderr,"Reading in weights file\n");
2359   FILE* wfile = fopen(par.wfile,"rb");
2360   if (v>=1 && wfile==NULL)
2361     {
2362       fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
2363       perror("fopen");
2364       fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2365       par.trans=0;
2366       return;
2367     }
2368   unused = fread(&N,sizeof(int),1,wfile);  // read matrix dimension (i.e. number of HMMs in database)
2369   if (v>=1 && N!=N_searched)
2370     {
2371       fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
2372       fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2373       par.trans=0;
2374       return;
2375     }
2376   if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
2377   // Read names of HMMs (to specify mapping of HMM to matrix indices)
2378   for (k=0; k<N; k++)
2379     {
2380       unused = fread(name,sizeof(char),IDLEN,wfile);
2381       index.Add(name,k);
2382     }
2383   // Read symmetric Z-scores matrix
2384   Z = new(float*[N]);
2385   for (k=0; k<N; k++)
2386     {
2387       Z[k] = new(float[N]);
2388       for (l=0; l<k; l++) Z[k][l] = Z[l][k];
2389       unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
2390     }
2391   // Read symmetric covariance matrix
2392   C = new(float*[N]);
2393   for (k=0; k<N; k++)
2394     {
2395       C[k] = new(float[N]);
2396       for (l=0; l<k; l++) C[k][l] = C[l][k];
2397       unused = fread(C[k]+k,sizeof(float),N-k,wfile);
2398     }
2399   fclose(wfile);
2400
2401   // Allocate memory
2402   Zq = new(float[N]);
2403   Ztq = new(float[N]);
2404   Zrq = new(float[N]);
2405   fold = new(char*[N]);
2406   fam = new(char*[N]);
2407   Prob = new(float[N]);
2408   ll = new(int[N]);
2409   w = new(float[N]);
2410
2411   // Transform P-values to normally distributed Z-scores and store in Zq vector
2412   fprintf(stderr,"Transform P-values to Z-scores\n");
2413   float Zmax_neg   = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
2414   float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
2415   printf("Zmax = %6.2f   Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
2416
2417   Reset();
2418   while (!End())
2419     {
2420       hit = ReadNext();
2421       if (hit.irep>1) continue;
2422       k = index.Show(hit.name);
2423       if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
2424       if (hit.logPvalt<0)
2425         Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt));  // Zq[k] = 0.5*(Zkq + Zqk)
2426       else
2427         Zq[k] = Score2Z(fabs(hit.logPval));                           // Zq[k] = Zqk
2428 //      printf("%4i  %-10.10s logPvalt=%9g  Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
2429 //      if (isnan(Zq[k])) {
2430 //      fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2431 //      printf("%4i  %-10.10s logPval=%9g  logPvalt=%9g  Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
2432 //      par.trans=0;
2433 //      return;
2434 //       }
2435       if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
2436       fold[k] = new(char[IDLEN]);
2437       fam[k] = new(char[IDLEN]);
2438       strcpy(fold[k],hit.fold);
2439       strcpy(fam[k],hit.fam);
2440       weight[k] = hit.weight;
2441       Prob[k] = hit.Probab;
2442    }
2443
2444   if (v>=3)
2445     {
2446       excluded.Reset();
2447       while (!excluded.End())
2448         {
2449           excluded.ReadNext(name);
2450           printf("Excluded fold %s from fitting to Ztq\n",name);
2451         }
2452     }
2453
2454   ////////////////////////////////////////////////////////////////
2455   // Calculate transitive score (query->l) Zt[l]
2456
2457   // Construct vector ll of indices l for which Z_lq > Zmin_trans
2458   m = 0;
2459   for (l=0; l<N; l++)
2460     if (Zq[l]>=Zmin_trans) ll[m++]=l;
2461   M = m;  // number of indices l for which Z_lq,Z_lk > Zmin_trans
2462
2463 //   for (m=0; m<M; m++)
2464 //     fprintf(stderr,"m=%-4i l=%-4i  %-10.10s  Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
2465
2466   if (M<=1)
2467     for (k=0; k<N; k++) Ztq[k]=0.0;
2468   else
2469     {
2470       // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2471       double** Csub = new(double*[M]);
2472       for (m=0; m<M; m++)
2473         {
2474           Csub[m] = new(double[M]);
2475           for (n=0; n<M; n++)
2476             Csub[m][n] = double(C[ll[m]][ll[n]]);
2477         }
2478
2479       if (v>=3)
2480         {
2481           fprintf(stderr,"Covariance matrix\n");
2482           PrintMatrix(Csub,M);
2483         }
2484
2485
2486       // Calculate weights w[l]
2487       for (m=0; m<M; m++)
2488         {
2489           double sum = 0.0;
2490           for (n=0; n<M; n++)
2491             sum += fmax(0.0,Csub[m][n]);
2492           printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum);
2493           w[m] = 1.0/sum;
2494         }
2495
2496       // Calculate Ztq[k] for all HMMs k
2497       fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
2498       float norm = NormalizationFactor(Csub,w,M);
2499       for (k=0; k<N; k++)
2500         {
2501           double sumZ = 0.0;
2502           for (m=0; m<M; m++)
2503             sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2504           Ztq[k] = sumZ/norm;
2505         }
2506
2507       for (l=0; l<M; l++){
2508         delete[](Csub[l]); (Csub[l]) = NULL;
2509       }
2510       delete[](Csub); (Csub) = NULL;
2511     }
2512
2513   ////////////////////////////////////////////////////////////////
2514   // Calculate reverse transitive score (l->query-) Zrq[l]
2515
2516   fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
2517   for (k=0; k<N; k++)
2518     {
2519       // Construct vector ll of indices l for which Z_lk > Zmin_tran
2520       m = 0;
2521       for (l=0; l<N; l++)
2522           if (Z[k][l]>=Zmin_trans) ll[m++]=l;
2523       int M = m;  // number of indices l for which Z_lq,Z_lk > Zmin_tran
2524
2525
2526 //    fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
2527 //    for (m=0; m<M; m++)
2528 //    printf(stderr,"m=%-4i k=%-4i  l=%-4i  %-10.10s  Zq[l]=%7f  Z_lk=%7f  \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
2529
2530       if (M<=1)
2531         {
2532           Zrq[k] = Zq[k];
2533         }
2534       else
2535         {
2536           // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2537           double** Csub = new(double*[M]);
2538           for (m=0; m<M; m++)
2539             {
2540               Csub[m] = new(double[M]);
2541               for (n=0; n<M; n++)
2542                 Csub[m][n] = double(C[ll[m]][ll[n]]);
2543             }
2544 //        fprintf(stderr,"Covariance matrix\n");
2545 //        PrintMatrix(Csub,M);
2546
2547           // Calculate weights w[l]
2548           for (m=0; m<M; m++)
2549             {
2550               double sum = 0.0;
2551               for (n=0; n<M; n++)
2552                 sum += fmax(0.0,Csub[m][n]);
2553               w[m] = 1.0/sum;
2554             }
2555
2556 //        for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
2557
2558
2559           // Calculate Zrq[k] and normalize
2560           float norm = NormalizationFactor(Csub,w,M);
2561           double sumZ = 0.0;
2562           for (m=0; m<M; m++)
2563             sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2564           Zrq[k] = sumZ/norm;
2565
2566           for (l=0; l<M; l++){
2567             delete[](Csub[l]); (Csub[l]) = NULL;
2568           }
2569           delete[](Csub); (Csub) = NULL;
2570         }
2571
2572 //    fprintf(stderr,"\nZq[k]=%8.2g  Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
2573     }
2574
2575   // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
2576   for (k=0; k<N; k++)
2577     {
2578       float Zqtot =  Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
2579 //        if (isnan(Zqtot))
2580 //      {
2581 //        fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2582 //        printf("%4i  %-10.10s Zq=%6.2f  Ztq=%6.2f  Zrq=%6.2f  Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2583 //        par.trans=0;
2584 //        return;
2585 //      }
2586       if (v>=3 &&  Zq[k] + Zqtot > 2*Zmin_trans) {
2587         printf("%4i  %-10.10s Zq=%6.2f  Ztq=%6.2f  Zrq=%6.2f  -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2588       }
2589       Ztq[k] = Zqtot;
2590     }
2591
2592   // Calculate mean and standard deviation of Z1q
2593   fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
2594   double sumw=0.0;
2595   double sumZ=0.0;
2596   double sumZ2=0.0;
2597   for (k=0; k<N; k++)
2598     {
2599       if (excluded.Contains(fold[k])) continue;
2600       sumw  += weight[k];
2601       sumZ  += weight[k]*Ztq[k];
2602       sumZ2 += weight[k]*Ztq[k]*Ztq[k];
2603 //       if (isnan(sumZ))
2604 //      {
2605 //        fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2606 //        printf("%4i  %-10.10s Zq=%9f  Zrq=%9f  Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
2607 //        par.trans=0;
2608 //        return;
2609 //      }
2610     }
2611   float mu = sumZ/sumw;
2612   float sigma = sqrt(sumZ2/sumw-mu*mu);
2613   if (v>=2) printf("mu(Ztq)=%6.3f  sigma(Ztq)=%6.2f\n",mu,sigma);
2614   sigma *= 1.01;// correct different fitting of EVD and normal variables
2615
2616   // Normalize Ztq and calculate P1-values
2617   fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
2618   Reset();
2619   while (!End())
2620     {
2621       hit = ReadNext();
2622       hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
2623       hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));
2624       // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
2625       hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
2626       hit.Probab = Probab(hit);
2627       hit.score_sort = hit.logPval;
2628       Overwrite(hit);                                        // copy hit object into current position of hitlist
2629    }
2630
2631   for (k=0; k<N; k++){
2632     delete[](Z[k]); (Z[k]) = NULL;
2633   }
2634   for (k=0; k<N; k++){
2635     delete[](C[k]); (C[k]) = NULL;
2636   }
2637   for (k=0; k<N; k++){
2638     delete[](fold[k]); (fold[k]) = NULL;
2639   }
2640   for (k=0; k<N; k++){
2641     delete[](fam[k]); (fam[k]) = NULL;
2642   }
2643   delete[](C); (C) = NULL;
2644   delete[](Z); (Z) = NULL;
2645   delete[](fold); (fold) = NULL;
2646   delete[](fam); (fam) = NULL;
2647   delete[](Prob); (Prob) = NULL;
2648   delete[](ll); (ll) = NULL;
2649   delete[](Zq); (Zq) = NULL;
2650   delete[](Ztq); (Ztq) = NULL;
2651 }
2652
2653
2654 /////////////////////////////////////////////////////////////////////////////////////
2655 /**
2656  * @brief Score2Z transforms the -log(P-value) score into a Z-score for 0 < S
2657  * Score2Z(S) = sqrt(2)*dierfc(2*e^(-S)), where dierfc is the inverse of the complementary error function
2658  */
2659 double
2660 HitList::Score2Z(double S)
2661 {
2662   double s, t, u, w, x, y, z;
2663   if (S<=0) return double(-100000);
2664   y = ( S>200 ? 0.0 : 2.0*exp(-S) );
2665   if (y > 1)
2666     {
2667       z =  (S<1e-6? 2*S : 2-y);
2668       w = 0.916461398268964 - log(z);
2669     }
2670   else
2671     {
2672       z = y;
2673       w = 0.916461398268964 - (0.69314718056-S);
2674     }
2675
2676   u = sqrt(w);
2677   s = (log(u) + 0.488826640273108) / w;
2678   t = 1 / (u + 0.231729200323405);
2679
2680   x = u * (1 - s * (s * 0.124610454613712 + 0.5)) -
2681     ((((-0.0728846765585675 * t + 0.269999308670029) * t +
2682        0.150689047360223) * t + 0.116065025341614) * t +
2683      0.499999303439796) * t;
2684   t = 3.97886080735226 / (x + 3.97886080735226);
2685   u = t - 0.5;
2686   s = (((((((((0.00112648096188977922 * u +
2687         1.05739299623423047e-4) * u - 0.00351287146129100025) * u -
2688         7.71708358954120939e-4) * u + 0.00685649426074558612) * u +
2689         0.00339721910367775861) * u - 0.011274916933250487) * u -
2690         0.0118598117047771104)  * u + 0.0142961988697898018) * u +
2691         0.0346494207789099922)  * u + 0.00220995927012179067;
2692   s = ((((((((((((s * u - 0.0743424357241784861) * u -
2693         0.105872177941595488) * u + 0.0147297938331485121) * u +
2694         0.316847638520135944) * u + 0.713657635868730364) * u +
2695         1.05375024970847138)  * u + 1.21448730779995237) * u +
2696         1.16374581931560831)  * u + 0.956464974744799006) * u +
2697         0.686265948274097816) * u + 0.434397492331430115) * u +
2698         0.244044510593190935) * t -
2699     (z==0? 0: z * exp(x * x - 0.120782237635245222));
2700   x += s * (x * s + 1);
2701   if (y > 1) {
2702     x = -x;
2703   }
2704   return double (1.41421356237*x);
2705 }
2706
2707 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2708 /**
2709  * @brief Z2Score transforms the Z-score into a -log(P-value) value
2710  * Z2Score(Z) = log(2) - log( erfc(Z/sqrt(2)) ) , where derfc is the complementary error function
2711  */
2712 double
2713 HitList::Z2Score(double Z)
2714 {
2715     double t, u, x, y;
2716     x = 0.707106781188*Z;
2717     if (x>10) return 0.69314718056 - (-x*x - log( (1-0.5/x/x)/x/1.772453851) );
2718     t = 3.97886080735226 / (fabs(x) + 3.97886080735226);
2719     u = t - 0.5;
2720     y = (((((((((0.00127109764952614092 * u + 1.19314022838340944e-4) * u -
2721         0.003963850973605135)   * u - 8.70779635317295828e-4) * u +
2722         0.00773672528313526668) * u + 0.00383335126264887303) * u -
2723         0.0127223813782122755)  * u - 0.0133823644533460069) * u +
2724         0.0161315329733252248)  * u + 0.0390976845588484035) * u +
2725         0.00249367200053503304;
2726     y = ((((((((((((y * u - 0.0838864557023001992) * u -
2727         0.119463959964325415) * u + 0.0166207924969367356) * u +
2728         0.357524274449531043) * u + 0.805276408752910567) * u +
2729         1.18902982909273333)  * u + 1.37040217682338167) * u +
2730         1.31314653831023098)  * u + 1.07925515155856677) * u +
2731         0.774368199119538609) * u + 0.490165080585318424) * u +
2732         0.275374741597376782) * t * (x>10? 0.0 : exp(-x * x));
2733     return 0.69314718056 - log( x < 0 ? 2 - y : y );
2734 }
2735
2736
2737 /////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Write an N x N float matrix to stderr, one "k=<row>" header per row
 *        followed by "<column>:<value>" entries, ten entries per output line
 *
 * @param V  matrix, accessed as V[row][column]
 * @param N  number of rows and columns
 */
void
PrintMatrix(float** V, int N)
{
  for (int row=0; row<N; ++row)
    {
      const float* values = V[row];
      fprintf(stderr,"k=%4i \n",row);
      int col = 0;
      while (col<N)
        {
          fprintf(stderr,"%4i:%6.3f ",col,values[col]);
          ++col;
          // line break after every tenth entry
          if (col%10==0) fprintf(stderr,"\n");
        }
      fprintf(stderr,"\n");
    }
  fprintf(stderr,"\n");
}
2757
2758 /////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Write an N x N double matrix to stderr, one "k=<row>" header per row
 *        followed by "<column>:<value>" entries, ten entries per output line
 *
 * @param V  matrix, accessed as V[row][column]
 * @param N  number of rows and columns
 */
void
PrintMatrix(double** V, int N)
{
  for (int row=0; row<N; ++row)
    {
      const double* values = V[row];
      fprintf(stderr,"k=%4i \n",row);
      int col = 0;
      while (col<N)
        {
          fprintf(stderr,"%4i:%6.3f ",col,values[col]);
          ++col;
          // line break after every tenth entry
          if (col%10==0) fprintf(stderr,"\n");
        }
      fprintf(stderr,"\n");
    }
  fprintf(stderr,"\n");
}
2778
2779 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2780 /**
2781  * @brief
2782  */
2783 void
2784 HitList::Normalize(float* Ztq, char** fold, Hash<int>& excluded)
2785 {
2786   double sumw=0.0;
2787   double sumZ=0.0;
2788   double sumZ2=0.0;
2789   for (int k=0; k<N_searched; k++)
2790     {
2791       if (excluded.Contains(fold[k])) continue;
2792       sumw  += weight[k];
2793       sumZ  += weight[k]*Ztq[k];
2794       sumZ2 += weight[k]*Ztq[k]*Ztq[k];
2795     }
2796   float mu = sumZ/sumw;
2797   float sigma = sqrt(sumZ2/sumw-mu*mu);
2798   printf("Transitive score Ztq: mu=%8.3g  sigma=%8.3g\n",mu,sigma);
2799   for (int k=0; k<N_searched; k++) Ztq[k] = (Ztq[k]-mu)/sigma;
2800   return;
2801 }
2802
2803 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2804 /**
2805  * @brief Calculate standard deviation of Z1 = sum_m [ w_m * Z_m ], where Csub_mn = cov(Z_m,Z_n)
2806  */
2807 float
2808 HitList::NormalizationFactor(double** Csub, float* w, int M)
2809   {
2810     double sum=0.0;
2811     for (int m=0; m<M; m++)
2812       {
2813         double summ=0.0;
2814         for (int n=0; n<M; n++) summ += Csub[m][n]*w[n];
2815         sum += w[m]*summ;
2816       }
2817     return sqrt(sum);
2818   }
2819
2820 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2821 /**
2822  * @brief Calculate inverse of matrix A and store result in B
2823  */
2824 void
2825 HitList::InvertMatrix(double** B, double** A, int N)
2826 {
2827   if (N==0)
2828     {
2829       printf("Error: InvertMatrix called with matrix of dimension 0\n");
2830       exit(6);
2831     }
2832   if (N==1)
2833     {
2834       B[0][0] = (A[0][0]==0.0? 0 :1.0/A[0][0]);
2835       return;
2836     }
2837
2838   int k,l,m;
2839   double** V = new(double*[N]);
2840   double* s  = new(double[N]);
2841   for (k=0; k<N; k++) V[k] = new(double[N]);
2842
2843   // Copy original matrix A into B since B will be overwritten by SVD()
2844   for (k=0; k<N; k++)
2845     for (l=0; l<N; l++)
2846       B[k][l] = A[k][l];
2847
2848   SVD(B, N, s, V);  // U replaces B on output; s[] contains singluar values
2849
2850   // Calculate inverse of A: A^-1 = V * diag(1/s) * U^t
2851   double** U = B;
2852   // Calculate V[k][m] -> V[k][m] *diag(1/s)
2853   for (k=0; k<N; k++)
2854     for (m=0; m<N; m++)
2855       if (s[m]!=0.0) V[k][m] /= s[m]; else V[k][m] = 0.0;
2856   // Calculate V[k][l] -> (V * U^t)_kl
2857   for (k=0; k<N; k++)
2858     {
2859       if (v>=4 && k%100==0) printf("%i\n",k);
2860       for (l=0; l<N; l++)
2861         {
2862           s[l] = 0.0; // use s[] as temporary memory to avoid overwriting B[k][] as long as it is needed
2863           for (m=0; m<N; m++)
2864             s[l] += V[k][m]*U[l][m];
2865         }
2866       for (l=0; l<N; l++) V[k][l]=s[l];
2867     }
2868   for (k=0; k<N; k++)
2869     for (l=0; l<N; l++)
2870       B[k][l] = V[k][l];
2871
2872   for (k=0; k<N; k++){
2873     delete[](V[k]); (V[k]) = NULL;
2874   }
2875   delete[](V); (V) = NULL;
2876   return;
2877 }
2878
2879
2880 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2881 /**
2882  * @brief
2883  */
2884 void
2885 HitList::TransposeMatrix(double** V, int N)
2886 {
2887   int k,l;
2888   for (k=0; k<N; k++) // transpose Z for efficiency of ensuing matrix multiplication
2889     for (l=0; l<k; l++)
2890       {
2891         double buf = V[k][l];
2892         V[k][l] = V[l][k];
2893         V[l][k] = buf;
2894       }
2895 }
2896
2897 /////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper macros for SVD() and pythag() below. SQR, FMAX and IMIN first copy
// their arguments into the file-level static variables declared next to them,
// so each argument expression is evaluated exactly once. Because those
// temporaries are shared, a macro must not be nested inside itself.

// SQR(a): a*a (0.0 when a is exactly 0.0)
static double sqrarg;
#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
// FMAX(a,b): larger of two doubles
static double maxarg1,maxarg2;
#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))
// IMIN(a,b): smaller of two ints
static int iminarg1,iminarg2;
#define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))
// SIGN(a,b): magnitude of a with the sign of b (b >= 0.0 yields +|a|)
#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
2905
/**
 * @brief This is a version of the Golub and Reinsch algorithm for singular value decomposition for a quadratic
 * (n x n) matrix A. It is sped up by transposing the A and V matrices at various places in the algorithm.
 * On a 400x400 matrix it runs in 1.6 s or 2.3 times faster than the original (n x m) version.
 * On a 4993x4993 matrix it runs in 2h03 or 4.5 times faster than the original (n x m) version.
 *
 * Given a matrix a[0..n-1][0..n-1], this routine computes its singular value decomposition, A = U · W · V^t .
 * The matrix U replaces a on output. The diagonal matrix of singular values W is output as a vector w[0..n-1].
 * The matrix V (not the transpose V^t) is output as V[0..n-1][0..n-1].
 *
 * If a singular value has not converged after 30 iterations, an error
 * message is printed and the program terminates with exit code 7.
 * Progress information is printed for verbosity v>=4.
 *
 * @param A  input matrix (n x n); overwritten with U
 * @param n  dimension of A and V, length of w
 * @param w  output: the n singular values
 * @param V  output: matrix V (n x n); storage must be provided by the caller
 */
void
HitList::SVD(double **A, int n, double w[], double **V)
{
  int m=n; // in general algorithm A is an (m x n) matrix instead of (n x n)

  double pythag(double a, double b);
  int flag,i,its,j,jj,k,l=1,nm=1;
  double anorm,c,f,g,h,s,scale,x,y,z,*rv1;
  rv1=new(double[n]);   // work vector, released at the end of the function
  g=scale=anorm=0.0;

  // Householder reduction to bidiagonal form.
  if (v>=5) printf("\nHouseholder reduction to bidiagonal form\n");
  for (i=0;i<n;i++) {
    if (v>=4 && i%100==0) printf("i=%i\n",i);
    if (v>=4) fprintf(stderr,".");
    l=i+1;
    rv1[i]=scale*g;
    g=s=scale=0.0;
    // Transformation acting on column i (rows i..m-1)
    if (i < m) {
      for (k=i;k<m;k++) scale += fabs(A[k][i]);
      if (scale) {
        for (k=i;k<m;k++) {
          A[k][i] /= scale;
          s += A[k][i]*A[k][i];
        }
        f=A[i][i];
        g = -SIGN(sqrt(s),f);
        h=f*g-s;
        A[i][i]=f-g;
        for (j=l;j<n;j++) {
          for (s=0.0,k=i;k<m;k++) s += A[k][i]*A[k][j];
          f=s/h;
          for (k=i;k<m;k++) A[k][j] += f*A[k][i];
        }
        for (k=i;k<m;k++) A[k][i] *= scale;
      }
    }
    w[i]=scale *g;
    g=s=scale=0.0;
    // Transformation acting on row i (columns l..n-1)
    if (i < m && i != n-1) {
      for (k=l;k<n;k++) scale += fabs(A[i][k]);
      if (scale) {
        for (k=l;k<n;k++) {
          A[i][k] /= scale;
          s += A[i][k]*A[i][k];
        }
        f=A[i][l];
        g = -SIGN(sqrt(s),f);
        h=f*g-s;
        A[i][l]=f-g;
        for (k=l;k<n;k++) rv1[k]=A[i][k]/h;
        for (j=l;j<m;j++) {
          for (s=0.0,k=l;k<n;k++) s += A[j][k]*A[i][k];
          for (k=l;k<n;k++) A[j][k] += s*rv1[k];
        }
        for (k=l;k<n;k++) A[i][k] *= scale;
      }
    }
    // anorm: largest |w[i]|+|rv1[i]| seen so far; used below as the scale for the "negligible" tests
    anorm=FMAX(anorm,(fabs(w[i])+fabs(rv1[i])));
  }
  // Accumulation of right-hand transformations.
  if (v>=5) printf("\nAccumulation of right-hand transformations\n");
  TransposeMatrix(V,n);   // V is handled in transposed form until the end of the function
  for (i=n-1;i>=0;i--) {
    if (v>=4 && i%100==0) printf("i=%i\n",i);
    if (v>=4) fprintf(stderr,".");
    if (i < n-1) {
      if (g) {
        // Double division to avoid possible underflow.
        for (j=l;j<n;j++)
          V[i][j]=(A[i][j]/A[i][l])/g;
        for (j=l;j<n;j++) {
          for (s=0.0,k=l;k<n;k++) s += A[i][k]*V[j][k];
          for (k=l;k<n;k++) V[j][k] += s*V[i][k];
        }
      }
      for (j=l;j<n;j++) V[j][i]=V[i][j]=0.0;
    }
    V[i][i]=1.0;
    g=rv1[i];
    l=i;
  }
  // Accumulation of left-hand transformations.
  if (v>=5) printf("\nAccumulation of left-hand transformations\n");
  TransposeMatrix(A,n);   // A is handled in transposed form until the end of the function
  for (i=IMIN(m,n)-1;i>=0;i--) {
    if (v>=4 && i%100==0) printf("i=%i\n",i);
    if (v>=4) fprintf(stderr,".");
    l=i+1;
    g=w[i];
    for (j=l;j<n;j++) A[j][i]=0.0;
    if (g) {
      g=1.0/g;
      for (j=l;j<n;j++) {
        for (s=0.0,k=l;k<m;k++) s += A[i][k]*A[j][k];
        f=(s/A[i][i])*g;
        for (k=i;k<m;k++) A[j][k] += f*A[i][k];
      }
      for (j=i;j<m;j++) A[i][j] *= g;
    } else for (j=i;j<m;j++) A[i][j]=0.0;
    ++A[i][i];
  }

  // Diagonalization of the bidiagonal form: Loop over singular values, and over allowed iterations.
  if (v>=5) printf("\nDiagonalization of the bidiagonal form\n");
  for (k=n-1;k>=0;k--) {
    if (v>=4 && k%100==0) printf("k=%i\n",k);
    if (v>=4) fprintf(stderr,".");
    for (its=1;its<=30;its++) {
      flag=1;
      // Test for splitting. rv1[0] is zero here, so the loop always stops
      // at l=0 at the latest, before w[nm] with nm=-1 would be read.
      for (l=k;l>=0;l--) {
        nm=l-1;
        // "x+anorm == anorm" means x is negligible relative to anorm
        if ((double)(fabs(rv1[l])+anorm) == anorm) {
          flag=0;
          break;
        }
        if ((double)(fabs(w[nm])+anorm) == anorm) break;
      }
      if (flag) {
        // Cancellation of rv1[l], if l > 0.
        c=0.0;
        s=1.0;
        for (i=l;i<=k;i++) {
          f=s*rv1[i];
          rv1[i]=c*rv1[i];
          if ((double)(fabs(f)+anorm) == anorm) break;
          g=w[i];
          h=pythag(f,g);
          w[i]=h;
          h=1.0/h;
          c=g*h;
          s = -f*h;
          // rotate rows nm and i of the (transposed) matrix A
          for (j=0;j<m;j++) {
            y=A[nm][j];
            z=A[i][j];
            A[nm][j]=y*c+z*s;
            A[i][j]=z*c-y*s;
          }
        }
      }
      z=w[k];
      // Convergence.
      if (l == k) {
        // Singular value is made nonnegative.
        if (z < 0.0) {
          w[k] = -z;
          for (j=0;j<n;j++) V[k][j] = -V[k][j];
        }
        break;
      }
      if (its == 30) {printf("Error in SVD: no convergence in 30 iterations\n"); exit(7);}
      // Shift from bottom 2-by-2 minor.
      x=w[l];
      nm=k-1;
      y=w[nm];
      g=rv1[nm];
      h=rv1[k];
      f=((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
      g=pythag(f,1.0);
      f=((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
      // Next QR transformation:
      c=s=1.0;
      for (j=l;j<=nm;j++) {
        i=j+1;
        g=rv1[i];
        y=w[i];
        h=s*g;
        g=c*g;
        z=pythag(f,h);
        rv1[j]=z;
        c=f/z;
        s=h/z;
        f=x*c+g*s;
        g = g*c-x*s;
        h=y*s;
        y *= c;
        // rotate rows j and i of the (transposed) matrix V
        for (jj=0;jj<n;jj++) {
          x=V[j][jj];
          z=V[i][jj];
          V[j][jj]=x*c+z*s;
          V[i][jj]=z*c-x*s;
        }
        z=pythag(f,h);
        // Rotation can be arbitrary if z = 0.
        w[j]=z;
        if (z) {
          z=1.0/z;
          c=f*z;
          s=h*z;
        }
        f=c*g+s*y;
        x=c*y-s*g;

        // rotate rows j and i of the (transposed) matrix A
        for (jj=0;jj<m;jj++) {
          y=A[j][jj];
          z=A[i][jj];
          A[j][jj]=y*c+z*s;
          A[i][jj]=z*c-y*s;
        }
      }
      rv1[l]=0.0;
      rv1[k]=f;
      w[k]=x;
    }
  }
  // Undo the transpositions so that A (now U) and V are returned in normal orientation
  TransposeMatrix(V,n);
  TransposeMatrix(A,n);
  delete[](rv1); (rv1) = NULL;
}
3127
/**
 * @brief Computes (a^2 + b^2)^1/2 without destructive underflow or overflow.
 *
 * The larger magnitude is factored out, so only the ratio of the smaller to
 * the larger magnitude (at most 1) is ever squared. The square is computed
 * with a local temporary rather than through a shared file-level scratch
 * variable, so the function has no hidden state.
 *
 * @param a  first value
 * @param b  second value
 * @return   sqrt(a*a + b*b); 0.0 if both arguments are zero
 */
double
pythag(double a, double b)
{
  const double absa=fabs(a);
  const double absb=fabs(b);
  if (absa > absb)
    {
      const double ratio = absb/absa;
      return absa*sqrt(1.0+ratio*ratio);
    }
  // absb >= absa here; guard the division for a == b == 0
  if (absb == 0.0) return 0.0;
  const double ratio = absa/absb;
  return absb*sqrt(1.0+ratio*ratio);
}
3142
3143
3144 /* @* HitList::ClobberGlobal(void)
3145  */
3146 void
3147 HitList::ClobberGlobal(void){
3148
3149
3150   /* @<variables local to HitList::ClobberGlobal@> */
3151   class List<Hit>::ListEl<Hit>  *pvIter = head;
3152
3153   /* NOTE: no free/delete-ing of data to be done here
3154      hitlist only holds a shallow copy of hit;
3155      hit is being cleared off properly.
3156      just reset everything to 0/0.0/NULL.
3157      The only important thing to do at this stage
3158      is to attach head and tail and set size = 0
3159      (FS, 2010-02-18)
3160
3161      NOTE: I only ever saw 1 (one) in-between element,
3162      but there may ctually be a real linked list
3163      of more than 1 element (FS, 2010-02-18)
3164   */
3165
3166   //  printf("POINTER:\t%p\t=HEAD\n", head);
3167   while (pvIter->next != tail){
3168
3169     //    printf("POINTER:\t%p->\t%p\n", pvIter, pvIter->next);
3170     pvIter = pvIter->next;
3171
3172 #if 1
3173     pvIter->data.longname = pvIter->data.name =
3174       pvIter->data.file = pvIter->data.dbfile = NULL;
3175     pvIter->data.sname = NULL;
3176     pvIter->data.seq = NULL;
3177     pvIter->data.self = 0;
3178     pvIter->data.i = pvIter->data.j = NULL;
3179     pvIter->data.states = NULL;
3180     pvIter->data.S = pvIter->data.S_ss = pvIter->data.P_posterior = NULL;
3181     pvIter->data.Xcons = NULL;
3182     pvIter->data.sum_of_probs = 0.0;
3183     pvIter->data.Neff_HMM = 0.0;
3184     pvIter->data.score_ss = pvIter->data.Pval = pvIter->data.logPval =
3185       pvIter->data.Eval = pvIter->data.Probab = pvIter->data.Pforward = 0.0;
3186     pvIter->data.nss_conf = pvIter->data.nfirst =
3187       pvIter->data.i1 = pvIter->data.i2 = pvIter->data.j1 = pvIter->data.j2 =
3188       pvIter->data.matched_cols = pvIter->data.ssm1 = pvIter->data.ssm2 = 0;
3189 #endif
3190   }
3191   //  printf("POINTER:\t\t\t%p=TAIL\n", tail);
3192
3193
3194   head->next = tail;
3195   tail->prev = head;
3196   size = 0;
3197
3198   /* @= */
3199   return;
3200
3201 } /* this is the end of HitList::ClobberGlobal() */
3202
3203
3204 /*
3205  * EOF hhhitlist-C.h
3206  */