1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: hhutil-C.h 246 2011-06-15 12:41:04Z fabian $
21 //////////////////////////////////////////////////////////////////////////////
22 // MatchChr: lower case -> upper case and '.' -> '-'; InsertChr: the reverse (upper case -> lower case, '-' and digits -> '.')
23 //////////////////////////////////////////////////////////////////////////////
27 MatchChr(char c) {return ((c>='a' && c<='z')? c-'a'+'A' : (c=='.'? '-':c) );} // a-z -> A-Z, '.' -> '-', everything else unchanged
30 InsertChr(char c) {return ((c>='A' && c<='Z')? c+'a'-'A' : ((c>='0' && c<='9') || c=='-')? '.':c );} // A-Z -> a-z, digits and '-' -> '.', everything else unchanged
33 WordChr(char c) {return (int)((c>='A' && c<='Z') || (c>='a' && c<='z'));} // 1 if c is an ASCII letter, 0 otherwise
36 //////////////////////////////////////////////////////////////////////////////
38 * @brief Transforms the one-letter amino acid code into an integer between 0 and 22
43 //A R N D C Q E G H I L K M F P S T W Y V
44 if (c>='a' && c<='z') c+='A'-'a';
70 case 'U': return 4; //Selenocystein -> Cystein
71 case 'B': return 3; //D (or N)
72 case 'Z': return 6; //E (or Q)
77 if (c>=0 && c<=32) return -1; // white space and control characters
81 ///////////////////////////////////////////////////////////////////////////////
83 * @brief Transforms integers between 0 and 22 into the one-letter amino acid code
88 //A R N D C Q E G H I L K M F P S T W Y V
111 case ANY: return 'X';
112 case GAP: return '-';
113 case ENDGAP: return '-';
118 //////////////////////////////////////////////////////////////////////////////
120 * @brief Transforms the dssp/psipred secondary structure code into an integer number
126 if (c>='a' && c<='z') c+='A'-'a';
142 case '\t': return -1;
143 case '\n': return -1;
148 //////////////////////////////////////////////////////////////////////////////
150 * @brief Transforms integers between 0 and 8 into the dssp/psipred secondary structure code
172 //////////////////////////////////////////////////////////////////////////////
174 * @brief Transforms the solvent accessibility code into an integer number
180 if (c>='a' && c<='z') c+='A'-'a';
192 case '\t': return -1;
193 case '\n': return -1;
198 //////////////////////////////////////////////////////////////////////////////
200 * @brief Transforms integers between 0 and 5 into the solvent accessibility code
202 inline char i2sa(int c)
219 //////////////////////////////////////////////////////////////////////////////
221 * @brief Transforms alternative secondary structure symbols into symbols
229 case '~': return 'C';
230 case 'I': return 'C';
231 case 'i': return 'c';
252 //////////////////////////////////////////////////////////////////////////////
254 * @brief Transforms confidence values of psipred into internal code
277 //////////////////////////////////////////////////////////////////////////////
279 * @brief Transforms internal representation of psipred confidence values into printable chars
302 //////////////////////////////////////////////////////////////////////////////
304 * @brief Fast lookup of log2(1+2^(-x)) for x>=0 (precision < 0.35%)
307 fast_addscore(float x) // table-based approximation of log2(1+2^(-x)) for x>=0
309 static float val[2001]; // val[i]=log2(1+2^(-x)) evaluated at x=0.01*(i+0.5), i.e. at the centre of bin i
310 static char initialized; // static, hence zero-initialized; tested below to decide whether the table must be filled
311 if (x>20) return 0.0; // beyond the tabulated range (indices 0..2000) the result is treated as zero
314 fprintf(stderr,"Error in function fast_addscore: argument %g is negative\n",x);
317 if (!initialized) //Table not filled yet: fill in the log2-vector
319 for (int i=0; i<=2000; i++) val[i]=log2(1.0+pow(2,-0.01*(i+0.5)));
322 return val[(int)(100.0*x)]; // bin width 0.01: index = (int)(100*x), at most 2000 because x<=20 here
327 //////////////////////////////////////////////////////////////////////////////
329 * @brief Little utilities for output
332 fout(FILE* outf, int d) // write one tab-terminated integer field to outf
334 if (d>=99999) fprintf(outf,"*\t"); else fprintf(outf,"%i\t",d); // values >= 99999 are printed as '*' instead of the number
338 //////////////////////////////////////////////////////////////////////////////
343 FormatError(const char infile[], const char details[]="") // report a malformed input file on cerr; details is optional extra text
345 cerr<<"Error in "<</*par.argv[0],FS*/__FILE__<<": wrong format while reading file \'"<<infile<<". "<<details<<"\n"; // prints this source file's name (__FILE__) in place of the commented-out par.argv[0]. NOTE(review): the quote opened before infile is never closed in this message
350 OpenFileError(const char outfile[]) // report on cerr that outfile could not be opened
352 cerr<<endl<<"Error in "<</*par.argv[0],FS*/__FILE__<<": could not open file \'"<<outfile<<"\'\n"; // prints this source file's name (__FILE__) in place of the commented-out par.argv[0]
357 MemoryError(const char arrayname[]) // report on cerr a memory problem while creating the array called arrayname
359 cerr<<"Error in "<</*par.argv[0],FS*/__FILE__<<": Memory overflow while creating \'"<<arrayname<<"\'. Please report this bug to developers\n"; // prints this source file's name (__FILE__) in place of the commented-out par.argv[0]
364 InternalError(const char errstr[]) // report an internal error described by errstr on cerr
366 cerr<<"Error in "<</*par.argv[0],FS*/__FILE__<<": "<<errstr<<". Please report this bug to developers\n"; // prints this source file's name (__FILE__) in place of the commented-out par.argv[0]
371 //////////////////////////////////////////////////////////////////////////////
373 * @brief Takes family code (eg. a.1.2.3) and returns strings 'a', 'a.1', and 'a.1.2'
376 ScopID(char cl[], char fold[], char sfam[], const char fam[]) // cl, fold and sfam are output buffers; fam is the input family code
382 ptr = strchr(cl,'.'); //address of the first '.' in cl (NULL if there is none)
387 ptr = strchr(fold,'.'); //address of the first '.' in fold
388 if(ptr) ptr = strchr(ptr+1,'.'); //address of the second '.' (search resumes just after the first)
391 //get SCOP superfamily ID
393 ptr = strchr(sfam,'.'); //address of the first '.' in sfam
394 if(ptr) ptr = strchr(ptr+1,'.'); //address of the second '.'
395 if(ptr) ptr = strchr(ptr+1,'.'); //address of the third '.'
400 //////////////////////////////////////////////////////////////////////////////
402 * @brief Read up to n lines of outfile and write them to standard output (cout)
405 WriteToScreen(char* outfile, int n) // echo at most n lines of the file named outfile
407 char line[LINELEN]=""; // buffer for one line; getline below reads at most LINELEN-1 characters into it
409 outf.open(outfile, ios::in);
410 if (!outf) {OpenFileError(outfile);} // report a file that could not be opened
412 for(; n>0 && outf.getline(line,LINELEN); n--) cout<<line<<"\n"; // stops after n lines or when getline fails (end of file, or a line that does not fit the buffer)
418 WriteToScreen(char* outfile) {WriteToScreen(outfile,INT_MAX);} // overload without a practical line limit (n = INT_MAX)
422 /////////////////////////////////////////////////////////////////////////////////////
424 * @brief Read .hhdefaults file into array argv_conf (beginning at argv_conf[1])
427 ReadDefaultsFile(int& argc_conf, char** argv_conf) // fills argv_conf[1..argc_conf-1] with option words read from a .hhdefaults file
429 char line[LINELEN]="";
430 char filename[NAMELEN];
431 char* c_first; //pointer to first character of argument string
432 char* c; //pointer used to scan the line for the end of the current argument
435 argc_conf=1; //index of the next free slot in argv_conf; slot 0 is left unused, so this is (number of arguments read)+1
438 strcpy(filename,"./.hhdefaults"); //first try the current directory
439 configf = fopen(filename,"r");
440 if (!configf && getenv("HOME")) //not found there: fall back to $HOME/.hhdefaults
442 strcpy(filename,getenv("HOME")); //NOTE(review): unbounded copy - a long HOME value can overflow filename[NAMELEN]
443 strcat(filename,"/.hhdefaults");
444 configf = fopen(filename,"r");
447 if (v>=3) cerr<<"Warning: could not find ./.hhdefaults or "<<filename<<"\n";
451 else if (!configf) return; // only webserver has no home directory => need no warning
453 // Scan file until a line that starts like program_name
454 while (fgets(line,LINELEN,configf))
455 if (!strncmp(line,program_name,6)) break; //only the first 6 characters of program_name are compared
456 // Was such a line found? (same 6-character comparison on the last line read)
457 if (!strncmp(line,program_name,6))
459 // Read in options until end-of-file or empty line
460 while (fgets(line,LINELEN,configf) && strcmp(line,"\n"))
467 while (*c==' ' || *c=='\t') c++; //Advance until next non-white space
468 if (*c=='\0' || *c=='\n' || *c=='#') break; //Nothing but white space or a comment left on this line?
470 while (*c!=' ' && *c!='\t' && *c!='#' && *c!='\0' && *c!='\n' ) c++; //Advance until next white space, '#' or end of line
471 if (*c=='\0' || *c=='\n' || *c=='#') //Is end of line reached?
474 argv_conf[argc_conf]=new(char[strlen(c_first)+1]); //heap copy of the word; no matching delete[] appears in this function
475 strcpy(argv_conf[argc_conf++],c_first);
479 argv_conf[argc_conf]=new(char[strlen(c_first)+1]); //heap copy of the word; no matching delete[] appears in this function
480 strcpy(argv_conf[argc_conf++],c_first);
481 printf("Argument: %s\n",c_first); //printed unconditionally, unlike the messages below that test v
487 cout<<"Arguments read in from .hhdefaults:";
488 for (int argc=1; argc<argc_conf; argc++) cout<<(argv_conf[argc][0]=='-'? " ":"")<<argv_conf[argc]<<" "; //extra leading blank before every word that starts with '-'
491 else if (v>=3) cout<<"Read in "<<argc_conf<<" default arguments for "<<program_name<<" from "<<filename<<"\n"; //NOTE(review): argc_conf starts at 1, so this prints one more than the number of arguments stored
493 else //found no line starting like program_name
495 if (v>=3) cerr<<endl<<"Warning: no default options for \'"<<program_name<<"\' found in "<<filename<<"\n";
496 return; //no line starting like program_name found
503 /////////////////////////////////////////////////////////////////////////////////////
505 * @brief Set default parameter values
// Assigns a built-in default value to each field of the parameter object `par` listed below.
511 par.append=0; // overwrite output file
512 par.outformat=0; // 0: hhr 1: FASTA 2:A2M 3:A3M
513 par.p=20.0f; // minimum threshold for inclusion in hit list and alignment listing
514 par.E=1e6f; // maximum threshold for inclusion in hit list and alignment listing
515 par.b=10; // min number of alignments
516 par.B=500; // max number of alignments
517 par.z=10; // min number of lines in hit list
518 par.Z=500; // max number of lines in hit list
519 par.e=1e-3f; // maximum E-value for inclusion in output alignment, output HMM, and PSI-BLAST checkpoint model
520 par.showcons=1; // show consensus sequence
521 par.showdssp=1; // show DSSP secondary structure (assumed from the field name - TODO confirm)
522 par.showpred=1; // show predicted secondary structure (assumed from the field name - TODO confirm)
523 par.cons=0; // show first non-SS sequence as main representative sequence (not consensus)
524 par.nseqdis=1; // maximum number of query sequences for output alignment
525 par.mark=0; // 1: only marked sequences (or first) get displayed; 0: most divergent ones get displayed
526 par.aliwidth=80; // number of characters per line in output alignments for HMM search
527 par.max_seqid=90; // default for maximum sequence identity threshold
528 par.qid=0; // default for minimum sequence identity with query
529 par.qsc=-20.0f; // default for minimum score per column with query
530 par.coverage=0; // default for minimum coverage threshold
531 par.Ndiff=100; // pick Ndiff most different sequences from alignment
532 par.coverage_core=80; // Minimum coverage for sequences in core alignment
533 par.qsc_core=0.3f; // Minimum score per column of core sequence with query
534 par.coresc=-20.0f; // Minimum score per column with core alignment (HMM)
536 par.M=1; // match state assignment is by A2M/A3M
537 par.Mgaps=50; // Above this percentage of gaps, columns are assigned to insert states (for par.M=2)
538 par.calibrate=0; // default: no calibration
539 par.calm=0; // derive P-values from: 0:query calibration 1:template calibration 2:both
542 par.wg=0; // 0: use local sequence weights 1: presumably global ones (the earlier wording said "local" for both values) - TODO confirm
544 par.matrix=0; // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 3: BLOSUM62
545 par.pcm=2; // pseudocount mode: default=divergence-dependent (but not column-specific)
546 #if 1 /* Nelder-Mead on Baliscore */
547 par.pca=1.712190f; // default values for substitution matrix pseudocounts
548 par.pcb=1.039640f; // significant reduction of pcs by Neff_M starts around Neff_M-1=pcb
549 par.pcc=0.878067f; // pcs are reduced prop. to 1/Neff^pcc
550 par.pcw=0.0f; // wc>0 weighs columns according to their intra-column similarity
552 par.gapb=1.405220; // default values for transition pseudocounts
553 par.gapd=1.316760; // gap open penalty pseudocount; 0.25 corresponds to 7.1*gapf bits
554 par.gape=1.793780; // gap extension penalty pseudocount
555 par.gapf=1.034710; // factor for increasing gap open penalty for deletes
556 par.gapg=0.894772; // factor for increasing gap open penalty for inserts
557 par.gaph=0.544072; // factor for increasing gap extension penalty for deletes
558 par.gapi=0.862559; // factor for increasing gap extension penalty for inserts
559 #else /* Soeding's default*/
560 par.pca=1.0f; // default values for substitution matrix pseudocounts
561 par.pcb=1.5f; // significant reduction of pcs by Neff_M starts around Neff_M-1=pcb
562 par.pcc=1.0f; // pcs are reduced prop. to 1/Neff^pcc
563 par.pcw=0.0f; // wc>0 weighs columns according to their intra-column similarity
565 par.gapb=1.0; // default values for transition pseudocounts
566 par.gapd=0.15; // gap open penalty pseudocount; 0.25 corresponds to 7.1*gapf bits
567 par.gape=1.0; // gap extension penalty pseudocount
568 par.gapf=0.6; // factor for increasing gap open penalty for deletes
569 par.gapg=0.6; // factor for increasing gap open penalty for inserts
570 par.gaph=0.6; // factor for increasing gap extension penalty for deletes
571 par.gapi=0.6; // factor for increasing gap extension penalty for inserts
575 /* Viterbi parameters optimised on Sabre (R228), FS, r228 -> r229 */
576 par.pcaV=1.245150f; // default values for substitution matrix pseudocounts
577 par.pcbV=1.682110f; // significant reduction of pcs by Neff_M starts around Neff_M-1=pcb
578 par.pccV=1.483840f; // pcs are reduced prop. to 1/Neff^pcc
579 par.pcwV=0.0f; // wc>0 weighs columns according to their intra-column similarity
581 par.gapbV=0.818625; // default values for transition pseudocounts
582 par.gapdV=0.666110; // gap open penalty pseudocount; 0.25 corresponds to 7.1*gapf bits
583 par.gapeV=1.028050; // gap extension penalty pseudocount
584 par.gapfV=0.710760; // factor for increasing gap open penalty for deletes
585 par.gapgV=1.649800; // factor for increasing gap open penalty for inserts
586 par.gaphV=0.470604; // factor for increasing gap extension penalty for deletes
587 par.gapiV=0.829479; // factor for increasing gap extension penalty for inserts
589 /* Viterbi parameters optimised on Balibase, r244 -> r245 */
590 par.pcaV=1.333860f; // default values for substitution matrix pseudocounts
591 par.pcbV=1.934480f; // significant reduction of pcs by Neff_M starts around Neff_M-1=pcb
592 par.pccV=1.655610f; // pcs are reduced prop. to 1/Neff^pcc
593 par.pcwV=0.0f; // wc>0 weighs columns according to their intra-column similarity
595 par.gapbV=0.334525; // default values for transition pseudocounts
596 par.gapdV=0.074534; // gap open penalty pseudocount; 0.25 corresponds to 7.1*gapf bits
597 par.gapeV=0.320336; // gap extension penalty pseudocount
598 par.gapfV=0.151634; // factor for increasing gap open penalty for deletes
599 par.gapgV=0.641516; // factor for increasing gap open penalty for inserts
600 par.gaphV=0.266434; // factor for increasing gap extension penalty for deletes
601 par.gapiV=0.598414; // factor for increasing gap extension penalty for inserts
602 #else /* Soeding default*/
603 par.pcaV=1.0f; // default values for substitution matrix pseudocounts
604 par.pcbV=1.5f; // significant reduction of pcs by Neff_M starts around Neff_M-1=pcb
605 par.pccV=1.0f; // pcs are reduced prop. to 1/Neff^pcc
606 par.pcwV=0.0f; // wc>0 weighs columns according to their intra-column similarity
608 par.gapbV=1.0; // default values for transition pseudocounts
609 par.gapdV=0.15; // gap open penalty pseudocount; 0.25 corresponds to 7.1*gapf bits
610 par.gapeV=1.0; // gap extension penalty pseudocount
611 par.gapfV=0.6; // factor for increasing gap open penalty for deletes
612 par.gapgV=0.6; // factor for increasing gap open penalty for inserts
613 par.gaphV=0.6; // factor for increasing gap extension penalty for deletes
614 par.gapiV=0.6; // factor for increasing gap extension penalty for inserts
617 par.ssm=2; // ss scoring mode: 0:no ss score 1:score after alignment 2:score during alignment
618 par.ssw=0.11f; // weight of ss scoring
619 par.ssa=1.0f; // weight of ss evolution matrix
620 par.shift=-0.01f; // Shift match score up
621 par.mact=0.3001f; // Score threshold for MAC alignment in local mode (NOTE(review): the earlier wording cited 0.5001 as the value used to track user modification, but the value assigned here is 0.3001 - confirm)
622 par.corr=0.1f; // Weight of correlations of scores for |i-j|<=4
623 par.wstruc=1.0f; // Weight of structure scores
625 par.egq=0.0f; // no charge for end gaps as default
626 par.egt=0.0f; // no charge for end gaps as default
628 par.trans=0; // no transitive scoring as default
629 par.Emax_trans=100.0f; // use intermediate HMMs with E-values up to 100 between query and database HMM
630 par.Emax_trans=100.0f; // NOTE(review): repeats the assignment on the previous line; redundant
631 par.wtrans=1.0f; // Ztot[k] = Zq[k] + wtrans * (Zforward[k]+Zreverse[k])
632 par.ssgap=0; // 1: add secondary structure-dependent gap penalties 0:off
633 par.ssgapd=1.0f; // secondary structure-dependent gap-opening penalty (per residue)
634 par.ssgape=0.0f; // secondary structure-dependent gap-extension penalty (per residue)
635 par.ssgapi=4; // max. number of inside-integer(ii); gap-open-penalty= -ii*ssgapd
637 par.loc=1; // local vs. global alignment as default
638 par.altali=2; // find up to two (possibly overlapping) subalignments
639 par.forward=0; // 0: Viterbi algorithm; 1: Viterbi+stochastic sampling; 3:Maximum Accuracy (MAC) algorithm
640 par.realign=1; // realign with MAC algorithm
642 par.repmode=0; // repeats score independently of one another
643 par.columnscore=1; // Default column score is 1: null model pnul = 1/2 * (q_av(a)+p_av(a))
644 par.min_overlap=0; // automatic minimum overlap used
645 par.opt=0; // Default = optimization mode off
646 par.readdefaultsfile=0; // Default = do not read a defaults file ./.hhdefaults or HOME/.hhdefaults
647 par.maxdbstrlen=200; // maximum length of database string to be printed in 'Command' line of hhr file
649 par.idummy=par.jdummy=0; //
651 par.notags=1; // neutralize His-tags, FLAG-tags, C-myc-tags
653 // Initialize strings
654 strcpy(par.infile,"stdin");
655 strcpy(par.outfile,"");
656 strcpy(par. pairwisealisfile,"");
657 strcpy(par.buffer,"buffer.txt");
658 strcpy(par.scorefile,"");
659 strcpy(par.wfile,"");
660 strcpy(par.alnfile,"");
661 strcpy(par.hhmfile,"");
662 strcpy(par.psifile,"");
665 #if 0 /* read parameter file from home-dir */
666 #include "hhutil-C-help.h"
667 #endif /* read parameter file from home-dir */
670 } /** this is the end of SetDefaults() **/