#include #include #include #include #include #include #include #include "io_lib_header.h" #include "util_lib_header.h" #include "dp_lib_header.h" #include "define_header.h" #include "dev1_lib_header.h" //JM_STRAT #define ACTION(x) ((n_actions>=(x+1))?action_list[x]:NULL) #define ACTION2(x,y) ((n_actions>=(x+1))?action_list[x]:y) #define ATOI_ACTION(x) ((ACTION(x)!=NULL)?(atoi(ACTION(x))):0) /**************************************************************************************************/ /***************************** SEQ_REFORMAT ******************************************/ /**************************************************************************************************/ int output_transitions(char *outfile, Alignment *A); static int output_age_matrix ( char *outfile, int val); int SeqGCGCheckSum(char *seq, int len); static Sequence *seq2year ( Sequence *S, int modulo); static Sequence* output_n_pavie_age_channel (Sequence *S, char *name, int n); static Sequence* output_pavie_age_channel (Sequence *S, char *name, int modulo); static int output_seq2struc(char *outfile, Alignment *A); void output_conservation_statistics ( char *file, Alignment *A); /**************************************************************************************************/ /***************************** SEQ_REFORMAT ******************************************/ /**************************************************************************************************/ int seq_reformat ( int argc, char **in_argv) { Sequence_data_struc *D1=NULL; Sequence_data_struc *D2=NULL; Sequence_data_struc *D_ST=NULL; Action_data_struc *RAD; int a, b; char *in_format; char *in2_format; char *out_format; char *in_file; char *in2_file; char *out_file; char *out2_file; char *struc_in_format; char *struc_out_format; char *struc_in_file; char *struc_out_file; char**action_list; char **action; char *rename_file; char *cache; char ***rename_list=NULL; int code=CODE; char **argv; int n_actions=0; int print_format=0; /*INITIALIZATIONS*/ RAD=vcalloc ( 1, sizeof ( Action_data_struc)); RAD->keep_case=1; declare_name (cache);sprintf ( cache, "use"); declare_name(in_file); declare_name(in2_file); declare_name(out_file); declare_name(out2_file); declare_name(struc_in_format); declare_name(struc_out_format); declare_name(RAD->coor_file); declare_name(struc_in_file); declare_name(struc_out_file); declare_name(in_format); declare_name(in2_format); declare_name(out_format); declare_name(rename_file); argv=break_list ( in_argv, &argc, "=;, \n"); action_list=declare_char ( 100, 100); /*END INITIALIZATION*/ addrandinit ( (unsigned long) 500); if ( argc==1 || strm6 ( argv[1], "h", "-h", "help", "-help", "-man", "?")) { fprintf ( stdout, "\n%s (%s,%s,%s [%s])\n",PROGRAM, VERSION,AUTHOR, DATE, URL); fprintf ( stdout, "\n*********** MINIMUM SYNTAX *****************"); fprintf ( stdout, "\nseq_reformat -in -output "); fprintf ( stdout, "\nSome File formats are automatically recognised"); fprintf ( stdout, "\nSee Format section"); fprintf ( stdout, "\n"); fprintf ( stdout, "\n*********** MAIN FLAGS ******************"); fprintf ( stdout, "\n-in name........Name of the file read"); fprintf ( stdout, "\n-input format......Name of the format read, see Input Format Section"); fprintf ( stdout, "\n...................Automatic detection, except for seqs of numbers"); fprintf ( stdout, "\n...................-input number_aln | number_fasta"); fprintf ( stdout, "\n-in2 fname......Second alignment"); fprintf ( stdout, "\n-input2 format.....See -input"); fprintf ( stdout, "\n-exon_boundaries obj file"); fprintf ( stdout, "\n-out fname......Output file (defualt is STDOUT"); fprintf ( stdout, "\n-output format.....Output Format, default is fasta_aln"); fprintf ( stdout, "\n-struc_in name...File containing a coded aln"); fprintf ( stdout, "\n-struc_in_f format.See -input and output format section"); fprintf ( stdout, "\n-struc_out fname..Name of the output structure"); fprintf ( stdout, "\n-struc_out_f symbol"); fprintf ( stdout, "\n-keep_case=on|off..keep case, On by default"); fprintf ( stdout, "\n-action +ac1 +ac2..See the action section"); fprintf ( stdout, "\n-rename .....Rename the sequences following indications"); fprintf ( stdout, "\n...................File Format: One couple /line"); fprintf ( stdout, "\n...................Rename order into "); fprintf ( stdout, "\n...................code file: -output code_name"); fprintf ( stdout, "\n-code Rename file to "); fprintf ( stdout, "\n-decode Rename file to "); fprintf ( stdout, "\n-no_warning........Suppresses all warnings"); fprintf ( stdout, "\n-cache.............use,ignore,update,local, DirectoryName"); fprintf ( stdout, "\n"); fprintf ( stdout, "\n*********** REFORMAT ACTIONS *****************"); fprintf ( stdout, "\n +Xaction.............Specifies which file undergoes the action"); fprintf ( stdout, "\n +Xaction.............X=1: -in"); fprintf ( stdout, "\n +Xaction.............X=2: -in2"); fprintf ( stdout, "\n +Xaction.............X=3: -struc_in"); fprintf ( stdout, "\n +name2unique_name....replace duplicated name with name_#"); fprintf ( stdout, "\n +swap_header........,swapp comments: replace comments/name in 1 by in 2"); fprintf ( stdout, "\n +swap_lib_header.F...Replace the sequences in the tc_lib (-in) with those in F"); fprintf ( stdout, "\n .....................F is a legal FASTA file"); fprintf ( stdout, "\n +translate[0-2]......Translate on Frame 0, 1, 2 "); fprintf ( stdout, "\n +translate[3]........longuest ORF on direct strand"); fprintf ( stdout, "\n +translate[4]........longuest ORF on direct+complementary strand"); fprintf ( stdout, "\n +add_scale...addscale below aln"); fprintf ( stdout, "\n +rm_gap n ...........Removes col with n%% gap [n=100]"); fprintf ( stdout, "\n +rmgap_col SEQ1:SEQ2.Removes column with a gap in SEQ [#] "); fprintf ( stdout, "\n +backtranslate.......Random Backtranslation"); fprintf ( stdout, "\n +complement..........Produces the reverse complement"); fprintf ( stdout, "\n +reorder.............Reorders sequences of according to "); fprintf ( stdout, "\n .........random......Random_order"); fprintf ( stdout, "\n .........tree........Tree Order (in2)"); fprintf ( stdout, "\n +reorder_columns.....Reorders sequences of according to "); fprintf ( stdout, "\n .........random......Random_order"); fprintf ( stdout, "\n .........tree..mode..Tree Order (comuted with mode: sarmat, idmat, blosum62mt..."); fprintf ( stdout, "\n +aln2random_aln SCR..Randomize the aln, S: swap sequences names"); fprintf ( stdout, "\n .....................Swap residues within colums"); fprintf ( stdout, "\n .....................Swap residues across the aln"); fprintf ( stdout, "\n +aln2sample......N......"); fprintf ( stdout, "\n +aln2bootstrap...N......"); fprintf ( stdout, "\n +chain...............Identifies all the intermediate sequences from <-in>"); fprintf ( stdout, "\n .....................needed to join every sequence pair in <-in2>"); fprintf ( stdout, "\n +aln2cons mat_name..Ouputs a consensus sequence"); fprintf ( stdout, "\n .....................The consensus is determined using mat"); fprintf ( stdout, "\n .....................By Default, mat=blosum62mt, name=Cons"); fprintf ( stdout, "\n +aln2resindex........Prints the sequence index of each residue in -in for each -in2 sequence"); fprintf ( stdout, "\n +collapse_aln | file name"); fprintf ( stdout, "\n .....................Replaces a group of sequences with its consensus"); fprintf ( stdout, "\n .....................The replacement sequence is named "); fprintf ( stdout, "\n .....................List of sequences can be provided via a file"); fprintf ( stdout, "\n .....................File:>new_name seq1 seq2 seq3...."); fprintf ( stdout, "\n +original_seqnos.....Keep original seqnos [SWITCH]"); fprintf ( stdout, "\n +seqnos..............Print Seqnos [SWITCH]"); fprintf ( stdout, "\n +code_dna_aln........Undocumented") ; fprintf ( stdout, "\n +grep..[NAME|SEQ|COMMENT]..[KEEP|REMOVE]..[string]......"); fprintf ( stdout, "\n .....................Keeps or Removes Sequences matching string"); fprintf ( stdout, "\n +extract_block | |"); fprintf ( stdout, "\n .....................Extract column pos OR [start to end["); fprintf ( stdout, "\n ..................... Format"); fprintf ( stdout, "\n .......................seq start end | seq pos"); fprintf ( stdout, "\n .......................# for comments"); fprintf ( stdout, "\n .......................! seq offset_value (0 by default)"); fprintf ( stdout, "\n .....................Can extract as many positions as needed"); fprintf ( stdout, "\n .....................seq=cons: measure positions on the full aln"); fprintf ( stdout, "\n +cat_aln.............Concatenates the alignments input via -in and -in2"); fprintf ( stdout, "\n +cat_aln.............-if no -in2, -in is expected to be a list of alignments to concatenate"); fprintf ( stdout, "\n +msalist2cat_pwaln.min..max"); fprintf ( stdout, "\n .....................extract all pw projections and conctaenates those\n"); fprintf ( stdout, "\n .....................where id>=min and id<=max\n"); fprintf ( stdout, "\n .....................min and max can be omitted (min=0, max=100)\n"); fprintf ( stdout, "\n +seq2blast ..gather all possible homologues from NR (EBI BLAST)"); fprintf ( stdout, "\n +seq2msa ....makes a standard progressive alignment using matrix"); fprintf ( stdout, "\n +realign_block "); fprintf ( stdout, "\n .....................Realign column c1 to c2 (non inc.) with pg)"); fprintf ( stdout, "\n .....................pg reads fasta and outputs fasta"); fprintf ( stdout, "\n .....................pg -infile= -outfile="); fprintf ( stdout, "\n +extract_seq seq_name (start end seq_name start end...) | filename"); fprintf ( stdout, "\n .....................seq_name='*': every seq"); fprintf ( stdout, "\n .....................start='*' : real start"); fprintf ( stdout, "\n .....................end='*' : real end"); fprintf ( stdout, "\n .....................filename: fasta format"); fprintf ( stdout, "\n +extract_seq_list name1 name2"); fprintf ( stdout, "\n .....................Extracts entire sequences"); fprintf ( stdout, "\n +remove_seq sn1 sn2..Removes sequences sn1, sn2..."); fprintf ( stdout, "\n +remove_seq empty....Removes empty sequences (gap only)"); fprintf ( stdout, "\n +remove_seq unique...Remove all multiple occurences except the first"); fprintf ( stdout, "\n +thread_profile_on_msa "); fprintf ( stdout, "\n .....................Threads a list of profiles on corresponding seq"); fprintf ( stdout, "\n .....................File: >seqname _R_ [nlines]"); fprintf ( stdout, "\n +thread_dna_on_prot_aln"); fprintf ( stdout, "\n .....................-in DNA.seq and -in2 AA.aln"); fprintf ( stdout, "\n +thread_struc_on_aln"); fprintf ( stdout, "\n .....................-in structure and -in2 aln"); fprintf ( stdout, "\n +use_cons............Use the consensus for n[SWITCH]"); fprintf ( stdout, "\n +upper.n|[n1-n2].....n omitted sets everything to upper case"); fprintf ( stdout, "\n .....................To use n: provide a number_aln via:"); fprintf ( stdout, "\n .....................-struc_in -struc_in_f number_aln"); fprintf ( stdout, "\n .....................if use_cons is set n, is read on the cons"); fprintf ( stdout, "\n .....................n: will upper every residue with a value of n in struc_in"); fprintf ( stdout, "\n .....................[n1-n2]: upper residues between n1 and n2"); fprintf ( stdout, "\n +lower n|[n1-n2]....See +upper"); fprintf ( stdout, "\n +switchcase n|[n1-n2]See +upper"); fprintf ( stdout, "\n +color_residue | file"); fprintf ( stdout, "\n .....................File: seq_name pos color"); fprintf ( stdout, "\n .....................color: 0-9"); fprintf ( stdout, "\n +edit_residue | file"); fprintf ( stdout, "\n .....................File: seq_name pos color"); fprintf ( stdout, "\n .....................edit: upper|lower|symbol"); fprintf ( stdout, "\n +keep n|[n1-n2]....Only keep residues that have a score between n1 and n2"); fprintf ( stdout, "\n +invert..............Inverts the sequences: CAT => TAC"); fprintf ( stdout, "\n +rotate name Rotate an MSA, names each sequence name_col#"); fprintf ( stdout, "\n +convert n|[n1-n2] s1 s2 ...."); fprintf ( stdout, "\n +merge_annotation.... "); fprintf ( stdout, "\n .....................Converts residues with your alignment"); fprintf ( stdout, "\n .....................similar to upper"); fprintf ( stdout, "\n .....................s1: ABCDe turns every ABCD into e"); fprintf ( stdout, "\n .....................s1: #e turns any residue into e"); fprintf ( stdout, "\n aln2short_aln L C S..Turns sequences into shorter sequences"); fprintf ( stdout, "\n .....................L: list of residues to keep"); fprintf ( stdout, "\n .....................S: Size of Streches replaced by symbol C"); fprintf ( stdout, "\n +random n l..........Generates N random sequences of len l"); fprintf ( stdout, "\n .....................You must provide a file with -in"); fprintf ( stdout, "\n +count n|[n1-n2] s1 s2...."); fprintf ( stdout, "\n .....................Counts residues with your alignment"); fprintf ( stdout, "\n .....................similar to convert"); fprintf ( stdout, "\n +print_format........prints the format name"); fprintf ( stdout, "\n +keep_name...........Keep the original sequence name on extraction"); fprintf ( stdout, "\n +remove_aa pos Ml Ncycle Random_len"); fprintf ( stdout, "\n .....................Randomly modifies an alignment"); fprintf ( stdout, "\n .....................pos=0: chosen randomly"); fprintf ( stdout, "\n .....................MaxLen of the deletions, Ncycle: number of cycles"); fprintf ( stdout, "\n .....................Random_len: 0 sets the len to maxlen, 1 to a random value"); fprintf ( stdout, "\n +remove_nuc.x........Remove Position 1, 2 or 3 of every codon"); fprintf ( stdout, "\n +evaluate matrix..gop..gep"); fprintf ( stdout, "\n .....................Make a similarity evaluation with matrix"); fprintf ( stdout, "\n .....................use -output=score_ascii, or score_html."); fprintf ( stdout, "\n .....................You can filter on the values"); fprintf ( stdout, "\n +evaluate matrix..gop..gep"); fprintf ( stdout, "\n .....................Make an SP evaluation with matrix"); fprintf ( stdout, "\n .....................Uses Natural Gap penalties"); fprintf ( stdout, "\n .....................gop and gep must be negative"); fprintf ( stdout, "\n .....................use -output=color_ascii, color_html to get a color display"); fprintf ( stdout, "\n.....+evaluate_lat........Make a lateral evaluation with matrix"); fprintf ( stdout, "\n +msa_weight proc.....Computes weights using the procedure"); fprintf ( stdout, "\nRNA analysis Post Processing___________________________________________________"); fprintf ( stdout, "\n +aln2alifold.........Turns the MSA into a consensus structure"); fprintf ( stdout, "\n +add_alifold.........adds an alifold consensus structure"); fprintf ( stdout, "\n +alifold2analyze.mode..mode=stat_cache_list_aln_color_html_ps_usegap"); fprintf ( stdout, "\n .......................stat: compile Number of compensated mutations"); fprintf ( stdout, "\n .......................cache: ascii-code compensated mutations on aln"); fprintf ( stdout, "\n .......................html: color-code compensated mutations on aln"); fprintf ( stdout, "\n .......................aln: mark compensated mutations on stockholm aln"); fprintf ( stdout, "\n .......................usegap: do not ignore positions with gaps"); fprintf ( stdout, "\n +cmp_RNAfold.........compares the sec struc of in1 and in2 (computes them with alifold if missing)"); fprintf ( stdout, "\nMSA Post Processing___________________________________________________"); fprintf ( stdout, "\n +force_aln filename|seq1 res1 seq2 res2"); fprintf ( stdout, "\n .....................Forces residue 1 of seq1 to be aligned with res2 of seq 2"); fprintf ( stdout, "\n .....................In a file, there must be one pair of interaction/line"); fprintf ( stdout, "\n +sim_filter[_aln_Ix_iy_Cz_cw "); fprintf ( stdout, "\n ....................._, aln is assumed"); fprintf ( stdout, "\n ....................._I max identity to seq"); fprintf ( stdout, "\n ....................._i min identity to seq"); fprintf ( stdout, "\n ....................._C max cov on seq"); fprintf ( stdout, "\n ....................._c min cov on seq"); fprintf ( stdout, "\n +trim[_aln_%%%%50_n111_N50_T_Fn_fS_pS_max_sim] [string2]"); fprintf ( stdout, "\n ....................._, aln is assumed"); fprintf ( stdout, "\n ....................._%%%%"); fprintf ( stdout, "\n ....................._max Or _min "); fprintf ( stdout, "\n ....................._cov Or _sim Filter according to the coverage [Def: _sim]"); fprintf ( stdout, "\n ....................._n "); fprintf ( stdout, "\n ....................._N"); fprintf ( stdout, "\n ....................._T Reorder the sequences according to a tree BEFORE triming"); fprintf ( stdout, "\n ....................._Fn Keep only sequences that have AT LEAST ONE residue aligned"); fprintf ( stdout, "\n ......................in the n first and n last columns. "); fprintf ( stdout, "\n ....................._O Remove outlayers that have less than min average sim with other sequences"); fprintf ( stdout, "\n .....................Keeping Sequences: Sequences provided via -in2 will be kept"); fprintf ( stdout, "\n .....................Keeping Sequences: Sequences whose name contains in field fS will be kept"); fprintf ( stdout, "\n ....................._f designates a field"); fprintf ( stdout, "\n ..................... is a Perl regular expression"); fprintf ( stdout, "\n +aln2unalign Mode Penalty Threshold"); fprintf ( stdout, "\n .....................Identifies all the streches less conserved than than the average"); fprintf ( stdout, "\n .....................Mode: lower|number|unalign Act on all the resiues withs score with tree <-in2>)"); fprintf ( stdout, "\n ......................+tree_scan help to get P1 information"); fprintf ( stdout, "\n ......................+aln2tree help to get P2 information"); fprintf ( stdout, "\n .....................-in and -in2 can contain different taxons"); fprintf ( stdout, "\n +treelist2groups.N....count all topologies within a list of trees"); fprintf ( stdout, "\n .....................-in is in fasta format with each name being a newick file"); fprintf ( stdout, "\n .....................-in2 can be a list of sequences used to trim the trees"); fprintf ( stdout, "\n ......................N can be used to unresolve the trees with Depth N"); fprintf ( stdout, "\n +treelist2lti.N.C.....Reports the average stability of each sequence neighborhood"); fprintf ( stdout, "\n ......................Species can be selected via -in2 [Fasta file with Taxon names]"); fprintf ( stdout, "\n ......................OR the sequences observed in C% of the files are kept [Def: C=100]"); fprintf ( stdout, "\n +treelist2seq.C.......Reports the species observed in C% of the trees"); fprintf ( stdout, "\n +treelist2splits......List and counts all the splits in a list of trees"); fprintf ( stdout, "\n ......................splits can be restricted to a list of sequences provided via -in2"); fprintf ( stdout, "\n +treelist2dmat.......outputs a distance matrix for a list of trees"); fprintf ( stdout, "\n +tree_compute n s....Computes a tree using the MSA provided with -in"); fprintf ( stdout, "\n ....................n:0-9, controls the way the MSA is filtered"); fprintf ( stdout, "\n ....................s:pam250mt|blosum62mt|categories|enthropy"); fprintf ( stdout, "\n ....................s:controls the column evaluation in MSA"); fprintf ( stdout, "\n +change_distances.f.f:float, sets all the distances to f in the tree"); fprintf ( stdout, "\n +change_bootstrap n..:n=0 removes all the bootstrap values"); fprintf ( stdout, "\n .....................:n!=0 adds a the value n to every node"); fprintf ( stdout, "\n +tree2dpatree........Replaces tree distances with the minimum %%ID in"); fprintf ( stdout, "\n .....................the depending subgroup. The ID is measured on an"); fprintf ( stdout, "\n .....................-in=TREE -in2=ALN"); fprintf ( stdout, "\n +unroot..............Removes the root in the input tree"); fprintf ( stdout, "\n +tree2group.N.I.P....Reports all the tree subgroup with at most Nseq"); fprintf ( stdout, "\n .....................and at min I%% identity. Output format can be read by"); fprintf ( stdout, "\n .....................collapse_tree. New groups are named P_1, P_2..."); fprintf ( stdout, "\n +collapse_tree.F.....Collapses trees. F is either a file or a list"); fprintf ( stdout, "\n ..................... ..."); fprintf ( stdout, "\n +aln2tree............Computes a tree"); fprintf ( stdout, "\n ..ktupN|aln|sarmat ktupN: match size N to estimate distances"); fprintf ( stdout, "\n .....................aln: Measures distances on aln"); fprintf ( stdout, "\n .....................sarmat: expects in to be a SAR matrix of O and I"); fprintf ( stdout, "\n ..nj | cw............Runs Neighbor Joining OR Cw to compute Tree"); fprintf ( stdout, "\n ..dpa................Turns the tree into a daptree (+tree2dpatree)"); fprintf ( stdout, "\n +node_sort.....Sort leafs of tree n1, by node distance"); fprintf ( stdout, "\nMatrix Analysis___________________________________________________"); fprintf ( stdout, "\n +aln2mat_diaa........computes a dinucleotide matrix on a list of aln"); fprintf ( stdout, "\n +aln2mat.............computes a log odd matrix"); fprintf ( stdout, "\n +seq2lat_mat.........computes a transition matrix on seq provided via -in"); fprintf ( stdout, "\nStructure Analysis___________________________________________________"); fprintf ( stdout, "\n +struc2contacts.A.B D.Displays in capitals all the residues of A"); fprintf ( stdout, "\n ......................Less than D Angs from a residue of B"); fprintf ( stdout, "\n ......................A and B are pdb file, D is a distance in Angs"); fprintf ( stdout, "\n +seq2contacts.A.D.....Identifies all the residues in contact with ligands"); fprintf ( stdout, "\n ......................Ligands are in the FASTA header of struc in"); fprintf ( stdout, "\n ......................>Name _S_ [Target Struc] [Ligand1] [Chain] ..."); fprintf ( stdout, "\n ......................Output: number_fasta: 0=no contact, 1=ligand 1..."); fprintf ( stdout, "\n ......................9: residues in contact with more than 1 ligand"); fprintf ( stdout, "\n ......................Use -output=color_html/ascii to display result"); fprintf ( stdout, "\n +struc2nb...D.........Display a list of all the residues D appart"); fprintf ( stdout, "\n +rm_template...V......Removes _[S|G|R]_[template] to sequence names"); fprintf ( stdout, "\n ......................V: omitted | sequences <=> Output sequences"); fprintf ( stdout, "\n ......................V: template <=> Output templates"); fprintf ( stdout, "\n +add_template.F.......Add _[S|G|R]_[template] to sequence names"); fprintf ( stdout, "\n ......................F can either be a fasta file or an executable"); fprintf ( stdout, "\n ......................F: File: >name _S_ template"); fprintf ( stdout, "\n ......................F: executable: pg -infile= -outfile="); fprintf ( stdout, "\nMatrix Comparison___________________________________________________"); fprintf ( stdout, "\n +mat2cmp...............Returns the correlation coefficient between two matrices"); fprintf ( stdout, "\n .......................-in mat1 -input matrix, -in2 mat2 -input2 matrix"); fprintf ( stdout, "\n*********** INPUT FORMATS: Alignments *****************"); fprintf ( stdout, "\n AUTOMATIC RECOGNITION"); fprintf ( stdout, "\n perl_xxx:............. runs xxx onto the input file"); fprintf ( stdout, "\n xxxx > outfile..xxx reads any formats, outputs fasta"); fprintf ( stdout, "\n amps_aln saga_aln "); fprintf ( stdout, "\n clustal_aln fasta_aln msf_aln "); fprintf ( stdout, "\n dali_aln gotoh_aln pima_aln"); fprintf ( stdout, "\n dialign_aln matrix conc_aln"); fprintf ( stdout, "\n NON AUTOMATIC RECOGNITION (use the -input file to specify the format"); fprintf ( stdout, "\n number_aln newick_tree"); fprintf ( stdout, "\n"); fprintf ( stdout, "\n*********** INPUT FORMATS: Sequences *****************"); fprintf ( stdout, "\n fasta_seq dali_seq pir_seq"); fprintf ( stdout, "\n barton_list_tc amps_sd_scores EST_fasta"); fprintf ( stdout, "\n gor_seq gor_struc number_fasta[*]"); fprintf ( stdout, "\n swissprot tc_lib pdb_struc"); fprintf ( stdout, "\n"); fprintf ( stdout, "\n*********** INPUT FORMATS: Structures *****************"); fprintf ( stdout, "\n rna_number"); fprintf ( stdout, "\n alifold"); fprintf ( stdout, "\n*********** OUTPUT FORMATS: Alignments ******************"); fprintf ( stdout, "\n compressed_aln saga_aln clustal_aln"); fprintf ( stdout, "\n phylip_aln msf_aln fasta_aln "); fprintf ( stdout, "\n pir_aln "); fprintf ( stdout, "\n color_html,color_ps......colored using the struc_in file "); fprintf ( stdout, "\n color_protogene..........colors codons"); fprintf ( stdout, "\n color_exoset.............mixes conservation (gray) and introns (RGB)"); fprintf ( stdout, "\n color_pdf pw_lib_saga_aln tdna_aln"); fprintf ( stdout, "\n thread_dna_on_prot_aln"); fprintf ( stdout, "\n"); fprintf ( stdout, "\n*********** OUTPUT FORMATS: sequence ******************"); fprintf ( stdout, "\n fasta_seq fasta_seq1 gotoh_seq"); fprintf ( stdout, "\n gor_seq cache_id"); fprintf ( stdout, "\n"); fprintf ( stdout, "\n*********** OUTPUT FORMATS: weights ******************"); fprintf ( stdout, "\n constraints saga_pw_sd_weights nseq\n"); fprintf ( stdout, "\n"); fprintf ( stdout, "\n*********** OUTPUT Formats: special ****************"); fprintf ( stdout, "\n len name statistics<_hnrglNL>"); fprintf ( stdout, "\n sim............outputs a similarity matrix based on an id comparison of -in"); fprintf ( stdout, "\n sim_sarmat.....in is sar matrix"); fprintf ( stdout, "\n sim_idscore....makes dp alignment of the sequences using Blosum62mt"); fprintf ( stdout, "\n sim_idscoreDNA.makes dp alignment of the sequences using idmat"); fprintf ( stdout, "\n sim............if -in2 is set: in1 vs in2, idscore"); fprintf ( stdout, "\n code_name......Outputs a compact list of names for code/decode"); fprintf ( stdout, "\n"); fprintf ( stdout, "\n"); return EXIT_SUCCESS; } argv=standard_initialisation (argv, &argc); for ( a=1; a< argc; a++) { if (a==1 && argv[1][0]!='-') { sprintf( in_file, "%s", argv[a]); } else if ( strcmp ( argv[a], "-in_f")==0 ||strm(argv[a],"-input") ) { if ( strcmp ( argv[a], "-in_f")==0) fprintf ( stdout,"\nWARNING: %s deprecated, use -input instead", argv[a]); sprintf ( in_format, "%s", argv[a+1]); a++; } else if ( strcmp ( argv[a], "-cache")==0 ) { sprintf (cache, "%s", argv[a+1]); a++; } else if ( strcmp ( argv[a], "-exon_boundaries")==0 ) { set_string_variable ("exon_boundaries", argv[a+1]); a++; } else if ( strcmp ( argv[a], "-overaln_threshold")==0 ) { set_int_variable ("overaln_threshold", atoi(argv[a+1])); a++; } else if ( strcmp ( argv[a], "-overaln_target")==0 ) { set_int_variable ("overaln_target", atoi(argv[a+1])); a++; } else if ( strcmp ( argv[a], "-overaln_P1")==0 ) { set_int_variable ("overaln_P1", atoi(argv[a+1])); a++; } else if ( strcmp ( argv[a], "-overaln_P2")==0 ) { set_int_variable ("overaln_P2", atoi(argv[a+1])); a++; } else if ( strcmp ( argv[a], "-overaln_P3")==0 ) { set_int_variable ("overaln_P3", atoi(argv[a+1])); a++; } else if ( strcmp ( argv[a], "-overaln_P4")==0 ) { set_int_variable ("overaln_P4", atoi(argv[a+1])); a++; } else if ( strcmp ( argv[a], "-in2_f")==0||strm(argv[a],"-input2") ) { if ( strcmp ( argv[a], "-in_f")==0) fprintf ( stdout,"\nWARNING: %s deprecated, use -input2 instead", argv[a]); sprintf ( in2_format, "%s", argv[a+1]); a++; } else if ( strcmp ( argv[a], "-seqnos")==0) { sprintf (action_list[n_actions++], "seqnos"); } else if ( strcmp( argv[a], "-action")==0) { while ((a+1)keep_case=1; else RAD->keep_case=(strm3(argv[a], "on","ON","On"))?1:0; } else if ( strcmp ( argv[a], "-conv")==0) { if ( strncmp ( argv[a+1],"set",3)==0)RAD->symbol_list=make_symbols (argv[++a],&(RAD->n_symbol)); else { RAD->symbol_list=declare_char (STRING, STRING); while(!NEXT_ARG_IS_FLAG) { sprintf ( RAD->symbol_list[RAD->n_symbol], "%s", argv[++a]); RAD->n_symbol++; } } } else if ( strcmp ( argv[a], "-struc_in_f")==0 ||strcmp ( argv[a], "-input3")==0 ) { sprintf ( struc_in_format, "%s", argv[a+1]); a++; } else if ( strcmp ( argv[a], "-out_f")==0 ||strm(argv[a],"-output") ) { if ( strcmp ( argv[a], "-out_f")==0) fprintf (stdout, "\nWARNING: %s deprecated, use -output instead", argv[a]); sprintf ( out_format, "%s", argv[a+1]); a++; } else if ( strm ( argv[a], "-struc_out_f") || strm ( argv[a], "-output_struc") ) { sprintf ( struc_out_format, "%s", argv[a+1]); a++; } else if ( strcmp (argv[a],"-in")==0) { sprintf( in_file, "%s", argv[a+1]); a++; } else if ( strcmp (argv[a],"-rename")==0) { sprintf( rename_file, "%s", argv[a+1]); a++; } else if ( strcmp (argv[a],"-code")==0) { code=CODE; sprintf( rename_file, "%s", argv[a+1]); a++; } else if ( strcmp (argv[a],"-decode")==0) { code=DECODE; sprintf( rename_file, "%s", argv[a+1]); a++; } else if ( strcmp (argv[a],"-in2")==0) { sprintf( in2_file, "%s", argv[a+1]); a++; } else if ( strcmp (argv[a],"-coor")==0) { sprintf( RAD->coor_file, "%s", argv[a+1]); a++; } else if (strcmp (argv[a],"-out")==0) { sprintf (out_file, "%s", argv[a+1]); a++; } else if (strcmp (argv[a],"-out2")==0) { sprintf (out2_file, "%s", argv[a+1]); a++; } else if ( strcmp (argv[a],"-struc_in")==0 || strcmp (argv[a],"-in3")==0 ) { sprintf( struc_in_file, "%s", argv[a+1]); a++; } else if (strcmp (argv[a],"-struc_out")==0) { sprintf (struc_out_file, "%s", argv[a+1]); a++; } else if ( strcmp ( argv[a], "-rm_gap")==0) { RAD->rm_gap=1; } else if ( strcmp ( argv[a], "-print_format")==0) { print_format=1; } else if ( strcmp ( argv[a], "-no_warning")==0) { set_warning_mode (NO); } else { fprintf ( stdout, "\nUNKNOWN OPTION: %s", argv[a]); myexit(EXIT_FAILURE); } } /****************************************************************/ /* */ /* Data Preparation */ /* */ /* */ /****************************************************************/ prepare_cache (cache); /****************************************************************/ /* */ /* INPUT SEQ/ALN */ /* */ /* */ /****************************************************************/ if ( strm (out_format, "hasch")) { fprintf ( stdout, "%d\n", (int)hash_file(in_file)); return EXIT_SUCCESS; } if ( rename_file[0]) { rename_list=read_rename_file ( rename_file,code); } if ((D1=read_data_structure (in_format, in_file,RAD))!=NULL) { in_format=(in_format && in_format[0])?in_format:identify_seq_format(in_file); if (print_format)fprintf ( stdout, "\nFILE:%s FORMAT:%s\n", in_file, in_format); } else if ( in_file[0]) { fprintf ( stdout, "\nFORMAT of file %s Not Supported[FATAL:%s]\n", in_file, PROGRAM); myexit(EXIT_FAILURE); } if ((D2=read_data_structure (in2_format, in2_file,RAD))!=NULL){if (print_format)fprintf ( stderr, "\nFILE:%s FORMAT:%s\n", in2_file, (in2_format&&in2_format[0])?in2_format:identify_seq_format(in2_file));} else if (!D2 && in2_file[0]) { fprintf ( stderr, "\nFORMAT of file %s Not Supported [FATAL:%s]\n", in2_file, PROGRAM); myexit(EXIT_FAILURE); } /*STRUCTURE INPUT*/ if ((D_ST=read_data_structure (struc_in_format, struc_in_file,RAD))) { if ( D_ST->CL) { Constraint_list *CL; int *entry; CL=D_ST->CL; entry=vcalloc ( LIST_N_FIELDS, sizeof (int)); for (a=0; ane; a++) { entry=extract_entry (entry, a, CL); if ( D_ST->S)(D_ST->S)->seq[entry[SEQ1]][entry[R1]-1]=entry[WE]; } thread_seq_struc2aln (D_ST->A, D_ST->S); } else if ( name_is_in_list ("cons", ((D_ST)->A)->name, ((D_ST)->A)->nseq, 100)); else { D_ST->A=copy_aln ( D1->A, D_ST->A); thread_seq_struc2aln (D_ST->A, D_ST->S); } } else if ((strcmp (struc_in_format, "rna_number")==0) && in_file[0]) { D_ST->RNA_ST=read_rna_struc_number((D1->A),struc_in_file); } else if ( struc_in_format[0] && struc_in_file[0]) { fprintf ( stderr, "\nSTRUC %s UNKNOWN[FATAL]", struc_in_format); myexit(EXIT_FAILURE); } else { D_ST=vcalloc ( 1, sizeof (Sequence_data_struc)); } action=declare_char(100, 100); for ( a=0; a< n_actions;) { if (action_list[a][0]!='+') { fprintf ( stderr, "\nWARNING: Action %s Unknown. Actions start with a +", action_list[a]); myexit (EXIT_FAILURE); } else { b=0; sprintf ( action[b++], "%s", action_list[a++]+1); while ( aA= rename_seq_in_aln(D1->A, rename_list); if (D2)D2->A=rename_seq_in_aln (D2->A, rename_list); if (D_ST)D_ST->A=rename_seq_in_aln (D_ST->A,rename_list); if (D1)D1->T =rename_seq_in_tree (D1->T, rename_list); if (D2)D2->T =rename_seq_in_tree (D2->T, rename_list); if (D_ST)D_ST->T=rename_seq_in_tree (D_ST->T,rename_list); } if ( !out_format[0] && ! struc_out_format[0])sprintf ( out_format, "%s", (in_format && in_format[0])?in_format:"fasta_aln"); main_output ( D1, D2, D_ST, out_format, out_file); main_output ( D1, D2, D_ST, struc_out_format, struc_out_file); return EXIT_SUCCESS; } /**************************************************************************************************/ /***************************** FORMAT GUESSING ******************************************/ /**************************************************************************************************/ Sequence_data_struc *read_data_structure ( char *in_format, char *in_file, Action_data_struc *RAD) { Sequence_data_struc *D; char **seq_name=NULL, **sequences=NULL; int nseq=0, a; D=vcalloc ( 1, sizeof (Sequence_data_struc)); if (!in_file[0])return NULL; if (!in_format[0]) { in_format=identify_seq_format(in_file); } if (!in_format[0])return NULL; D->A=declare_Alignment(NULL); if ( RAD->keep_case)(D->A)->residue_case=KEEP_CASE; D->rm_gap=RAD->rm_gap; sprintf ( D->format, "%s", in_format); sprintf ( D->file, "%s", in_file); if ( strm2(in_format,"saga_aln","clustal_aln")) { read_aln (in_file, D->A); D->S=aln2seq(D->A); } else if ( strm (in_format, "treefile_list")) { int z; D->S=get_tree_file_list(in_file); D->A=seq2aln(D->S, D->A,NO_PAD); } else if ( strm (in_format, "file_list") || strm (in_format, "list")) { D->S=get_file_list(in_file); D->A=seq2aln(D->S, D->A,KEEP_GAP); } else if ( strm (in_format, "fasta_tree")) { int z; D->S=get_fasta_tree (in_file, NULL); D->A=seq2aln(D->S, D->A,NO_PAD); } else if ( strm (in_format, "tree_list") || strm (in_format, "treelist")) { char **line; FILE *seq,*dnd; int n=0; char *seq_file; FILE *fp; Sequence *T; seq_file=vtmpnam(NULL); seq=vfopen (seq_file, "w"); line=file2lines (in_file); fp=vfopen (seq_file, "w"); for ( n=1; nTree_%d\n%s\n", n,line[n]); } vfclose (fp); free_char (line, -1); return read_data_structure ( "fasta_tree",seq_file,RAD); } else if (strm (in_format, "matrix")) { D->M=read_matrice (in_file); } else if (strm4 (in_format, "newick_tree", "newick", "nh", "new_hampshire")) { D->T=main_read_tree (in_file); D->S=tree2seq(D->T, NULL); D->A=seq2aln (D->S,D->A, 0); } else if (strm (in_format, "blast_aln")) { if (read_blast_aln (in_file, D->A)) { D->S=aln2seq(D->A); } else { return NULL; } } else if ( strm( in_format,"number_aln")) { read_number_aln (in_file, D->A); D->S=aln2seq(D->A); } else if ( strm( in_format,"stockholm_aln")) { read_stockholm_aln (in_file, D->A); D->S=aln2seq(D->A); } else if ( strm( in_format,"gotoh_aln")) { read_gotoh_aln (in_file, D->A); D->S=aln2seq(D->A); } else if ( strm ( in_format, "msf_aln")) { read_msf_aln (in_file, D->A); D->S=aln2seq(D->A); } else if ( strm ( in_format, "amps_aln")) { read_amps_aln (in_file, D->A); D->S=aln2seq(D->A); } else if ( strm (in_format, "excel_seq")) { D->S=perl_reformat2fasta ("excel2fasta.pl",in_file); (D->S)->contains_gap=0; D->A=seq2aln(D->S, D->A,RAD->rm_gap); } else if ( strm (in_format, "pavie_seq")) { D->S=perl_reformat2fasta ("pavie2fasta.pl",in_file); (D->S)->contains_gap=0; D->A=seq2aln(D->S, D->A,RAD->rm_gap); } else if ( strncmp (in_format, "perl_",5 )==0) { D->S=perl_reformat2fasta (in_format+5,in_file); (D->S)->contains_gap=0; D->A=seq2aln(D->S, D->A,RAD->rm_gap); } else if ( strm (in_format, "number_fasta")) { D->S=get_fasta_sequence_num (in_file, NULL); (D->S)->contains_gap=0; D->A=seq2aln(D->S, D->A,RAD->rm_gap); } else if ( strm (in_format, "raw_fasta")) { D->S=get_fasta_sequence_raw (in_file, NULL); (D->S)->contains_gap=0; D->A=seq2aln(D->S, D->A,RAD->rm_gap); } else if ( strm2 (in_format, "fasta_aln", "fasta_seq")) { D->S=get_fasta_sequence (in_file, NULL); if ( strcmp (in_format, "fasta_aln")==0)(D->S)->contains_gap=0; D->A=seq2aln(D->S, D->A,RAD->rm_gap); } else if ( strm (in_format, "fasta_tree")) { D->S=get_fasta_tree (in_file, NULL); D->A=seq2aln(D->S, D->A, NO_PAD); } else if ( strm (in_format, "pdb") || strm (in_format, "pdb_struc")) { D->S=get_pdb_sequence (in_file); if ( D->S==NULL) { add_warning (stderr, "FAILED TO find PDB File %s", in_file); myexit (EXIT_FAILURE); } D->A=seq2aln(D->S, D->A,RAD->rm_gap); } else if ( strm2(in_format, "pir_seq", "pir_aln")) { D->S=get_pir_sequence ( in_file,NULL ); seq2aln(D->S, D->A, RAD->rm_gap); } else if ( strm(in_format, "gor_seq") ) { D->S=get_gor_sequence ( in_file,NULL ); seq2aln(D->S, D->A, RAD->rm_gap); } else if ( strm2 ( in_format, "dali_aln", "dali_seq")) { D->S=get_sequence_dali ( in_file); seq2aln(D->S, D->A, RAD->rm_gap); } else if ( strm (in_format, "barton_list_tc")) { get_barton_list_tc_seq ( in_file); } else if ( strm (in_format, "amps_sd_scores")) { D->W=get_amps_sd_scores ( in_file); } else if ( strm ( in_format, "pima_aln")) { D->S=get_pima_sequence ( in_file); seq2aln (D->S, D->A, RAD->rm_gap); } else if ( strm( in_format, "gor_struc")) { D->S=get_struc_gor ( in_file); seq2aln(D->S, D->A, RAD->rm_gap); } else if ( strm( in_format, "dialign_aln")) { D->S=get_dialign_sequence ( in_file); seq2aln (D->S, D->A, RAD->rm_gap); } else if ( strm( in_format, "tc_lib") || strm( in_format, "mocca_lib") || strm( in_format, "lib")) { read_seq_in_list (in_file,&nseq,&sequences,&seq_name); D->S=fill_sequence_struc ( nseq, sequences, seq_name); D->CL=declare_constraint_list ( D->S,NULL, NULL, 0,NULL, NULL); D->CL=read_constraint_list_file(D->CL,in_file); seq2aln (D->S, D->A, RAD->rm_gap); free_char (sequences,-1); free_char (seq_name, -1); } else if ( strm( in_format,"swissprot_seq")) { D->S=get_swissprot_sequence ( in_file,NULL); seq2aln (D->S, D->A, RAD->rm_gap); } else if (strm (in_format, "alifold")) { D->S=read_alifold ( in_file); seq2aln (D->S, D->A,0); } else { return NULL; } if ( D->A) { for ( a=0; a<(D->A)->nseq; a++)sprintf ( (D->A)->file[a], "%s", in_file); } if ( D->S) { for ( a=0; a<(D->A)->nseq; a++)sprintf ( (D->S)->file[a], "%s", in_file); } return D; } Sequence *read_sequences (char *name) { return main_read_seq (name); } Alignment * alifold2aln (char *file) { Sequence *S; S=read_alifold(file); sprintf ( S->seq[0],"%s", S->seq[1]); return seq2aln (S, NULL, 0); } Sequence * read_alifold (char *file) { Sequence *S; char **list; int l; S=declare_sequence (1,count_n_char_in_file (file),2); list=file2lines (file); S->seq[0]=list[1]; S->seq[1]=list[2]; substitute (S->seq[0], "\n", "\0"); substitute (S->seq[0], " ", "\0"); substitute (S->seq[0], "_", STOCKHOLM_STRING); l=strlen (S->seq[0]); substitute (S->seq[1], "\n", "\0"); substitute (S->seq[1], " ", "\0"); substitute (S->seq[1], ".", STOCKHOLM_STRING); S->seq[1][l]='\0'; sprintf (S->name[0], "cons", file); sprintf (S->name[1], "#=GC SS_cons", file); return S; } Sequence * main_read_seq ( char *name) { char *format=NULL; Sequence *S=NULL; Alignment *A=NULL; int a; format=identify_seq_format (name); if ( getenv4debug ("DEBUG_REFORMAT"))fprintf ( stderr, "\n\nFormat %s\n", format); if (format &&strm(format, "fasta_seq")) { S= get_fasta_sequence ( name, NULL); } else if (format &&strm(format, "pir_seq")) S= get_pir_sequence ( name, NULL); else if (format &&strm(format,"swissprot_seq"))S= get_swissprot_sequence (name, NULL); else if (format && strstr (format, "aln")) { A=main_read_aln ( name, NULL); S=aln2seq(A); ungap_seq(S); free_aln(A); } else if ( format && strstr (format, "tc_lib")) { int nseq,b; char **sequences=NULL, **seq_name=NULL; read_seq_in_list (name,&nseq,&sequences,&seq_name); S=fill_sequence_struc ( nseq, sequences, seq_name); for ( b=0; b< S->nseq; b++)sprintf ( S->file[b], "%s",name); free_char (seq_name, -1);free_char (sequences, -1); } else { /*Use The ClustalW routine*/ S=cw_read_sequences (name); } for ( a=0; anseq; a++)sprintf ( S->file[a], "%s", name); vfree(format); ungap_seq(S); S=clean_sequence ( S); return S; } Alignment * main_read_aln ( char *name, Alignment *A) { int a; static char *format; Sequence *S=NULL; Sequence *IN_SEQ; if ( !name)return NULL; else if (!check_file_exists(name)) { if ( !check_file_exists (name+1))return NULL; else if ( name[0]=='A') name++; else if ( name[0]=='S') name++;/*Line Added for the -convert flag of T-Coffee*/ } if (!A)A=declare_aln(NULL); format=identify_seq_format (name); IN_SEQ=A->S; if ((format && strm(format, "saga_aln" )) ||strm(format, "clustal_aln")||strm(format, "t_coffee_aln" ) ) { read_aln ( name, A); } else if (format && strm (format, "conc_aln"))A=input_conc_aln (name,NULL); else if (format &&strm(format, "msf_aln" ))read_msf_aln ( name, A); else if (format &&strm(format, "blast_aln"))read_blast_aln (name, A); else if (format &&(strm(format, "fasta_aln"))) { S=get_fasta_sequence ( name, NULL); S->contains_gap=0; seq2aln (S, A, 0); } else if (format &&strm(format, "pir_aln")) { S=get_pir_sequence ( name, NULL); S->contains_gap=0; seq2aln (S, A, 0); } else if (format && strm(format, "fasta_seq") && A) { S=get_fasta_sequence ( name, NULL); for ( a=1; anseq; a++)if ( strlen (S->seq[a-1])!=strlen (S->seq[a])){free_sequence (S, S->nseq); free_aln (A); return NULL;} S->contains_gap=0; seq2aln (S, A, 0); } else if (format && strm(format, "pir_seq") && A) { S=get_pir_sequence ( name, NULL); for ( a=1; anseq; a++)if ( strlen (S->seq[a-1])!=strlen (S->seq[a])){free_sequence (S, S->nseq); free_aln (A); return NULL;} S->contains_gap=0; seq2aln (S, A, 0); } else { free_aln(A); return NULL; } if ( check_list_for_dup( A->name, A->nseq)) { fprintf ( stderr, "\nWARNING (main_read_aln): %s is duplicated in File %s ", check_list_for_dup( A->name, A->nseq), A->file[0]); A=aln2unique_name_aln(A); } if (IN_SEQ)A->S=IN_SEQ; else if (!A->S){A->S=aln2seq(A);} A->S=ungap_seq(A->S); A=fix_aln_seq(A, A->S); compress_aln (A); for ( a=0; a< A->nseq; a++) sprintf ( A->file[a], "%s", name); A=clean_aln (A); return A; } char * identify_aln_format ( char *file) { /*This function identify known sequence and alignmnent formats*/ return identify_seq_format (file); } char * identify_seq_format ( char *file) { char *format=NULL; /*This function identify known sequence and alignmnent formats*/ if ( format==NULL)format=vcalloc ( 100, sizeof (char)); else format[0]='\0'; if ( !check_file_exists(file)) { fprintf (stderr, "ERROR: %s Does Not Exist [FATAL:%s]\n",file, PROGRAM); myexit (EXIT_FAILURE); } else if ( is_stockholm_aln (file))sprintf (format, "stockholm_aln"); else if ( is_blast_file (file))sprintf ( format, "blast_aln"); else if ( is_pdb_file(file))sprintf ( format, "pdb_struc"); else if ( format_is_msf (file))sprintf ( format, "msf_aln"); else if ( format_is_fasta_seq(file))sprintf ( format, "fasta_seq"); else if ( format_is_fasta_aln(file))sprintf ( format, "fasta_aln"); else if ( format_is_pir_aln (file))sprintf ( format, "pir_aln"); else if ( format_is_pir_seq (file))sprintf ( format, "pir_seq"); else if ( format_is_oligo (file))sprintf ( format, "oligo_aln"); else if ( format_is_swissprot (file))sprintf ( format, "swissprot_seq"); else if ( format_is_saga (file))sprintf ( format, "clustal_aln"); else if ( format_is_conc_aln (file))sprintf ( format, "conc_aln"); else if ( is_lib (file))sprintf ( format, "tc_lib"); else if ( is_lib_02 (file))sprintf ( format, "tc_lib_02"); else if ( is_newick(file))sprintf ( format, "newick_tree"); else { add_warning ( stderr, "\nThe Format of File: %s was not recognized [SERIOUS:%s]",file, PROGRAM); } return format; } char **identify_list_format ( char **list, int n) { int a; char *name; char *string; char mode; declare_name (name); for ( a=0; a< n; a++) { sprintf (name, "%s", list[a]); string=list[a]; if ((mode=identify_format ( &string))!='?') { sprintf ( name, "%s", string); sprintf ( list[a], "%c%s", mode,name); } else { fprintf ( stderr, "\nERROR: %s not recognised [FATAL:%s]", name, PROGRAM); } } vfree(name); return list; } char * name2type_name ( char *name) { /*turns into , ...*/ char *new_name; char mode; new_name=vcalloc ( strlen (name)+2, sizeof (char)); sprintf ( new_name, "%s", name); if (is_in_set (name[0], "ALSMXPRW") && !check_file_exists(name)) { sprintf ( new_name, "%s", name); } else { mode=identify_format (&new_name); sprintf ( new_name, "%c%s", mode,name); } return new_name; } char identify_format (char **fname) { char mode='?'; mode=fname[0][0]; if ((is_in_set (mode, "ALMSPR") && check_file_exists(fname[0]+1)) ||(mode=='X' && is_matrix ( fname[0]+1)) ||(mode=='M' && is_method(fname[0]+1)) ) { fname[0]++; } else if (mode=='W' && !check_file_exists(fname[0])){fname[0]++;} else { /*WARNING: Order matters => internal methods can be confused with files, must be checked last*/ if (is_lib(fname[0]))mode='L'; else if (is_pdb_file(fname[0]))mode='P'; else if (is_seq(fname[0]))mode='S'; else if (is_aln(fname[0]))mode='A'; else if (is_matrix(fname[0]))mode='X'; else if (is_method(fname[0]))mode='M'; else mode='?'; } return mode; } int is_pdb_name ( char *name) { char command[1000]; int result; char *result_file; static char **buf_names; static int *buf_result; static int nbuf; FILE *fp; /*Use the look up*/ if ( !buf_names) { buf_names=declare_char (1000, 100); buf_result=vcalloc (1000, sizeof (int)); } if ( (result=name_is_in_list ( name, buf_names,nbuf,100))!=-1)return buf_result[result]; result_file=vtmpnam (NULL); sprintf ( command, "extract_from_pdb -is_pdb_name \'%s\' > %s", name, result_file); if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:is_pdb_name] %s\n", command); my_system ( command); fp=vfopen ( result_file, "r"); fscanf ( fp, "%d", &result); vfclose (fp); vremove ( result_file); sprintf ( buf_names[nbuf], "%s", name); result=buf_result[nbuf++]=(result==1)?1:0; return result; } char* get_pdb_id ( char *file) { /*receives the name of a pdb file*/ /*reads the structure id in the header*/ /*returns the pdb_id*/ char *tmp_name; char command[10000]; char cached [1000]; char fname[1000]; FILE *fp; char *id; char buf[1000]; tmp_name=vtmpnam(NULL); sprintf ( cached, "%s/%s", get_cache_dir(),file); if ( check_file_exists(cached))sprintf ( fname, "%s", cached); else sprintf ( fname, "%s", file); sprintf ( command, "extract_from_pdb -get_pdb_id %s > %s",fname, tmp_name); if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_id] %s\n", command); my_system ( command); buf[0]='\0'; fp=vfopen (tmp_name, "r"); fscanf ( fp, "\n%s\n", buf); vfclose (fp); if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_id]DONE\n"); id=vcalloc ( strlen (buf)+1, sizeof (char)); sprintf ( id, "%s", buf); return id; } char* get_pdb_struc(char *in_name, int start, int end) { char *name1,*name2; char command[LONG_STRING]; char *name; name=vcalloc ( STRING, sizeof (char)); sprintf ( name, "%s", in_name); if ( (name1=is_pdb_struc(name))==NULL && (name[0]=='P' && ((name1=is_pdb_struc (name+1))==NULL))) { fprintf ( stderr, "\nERROR Could not download structure %s [FATAL:%s]\n", name, PROGRAM);crash(""); } else if ( (start==0) && (end==0))return name1; else { declare_name(name2); sprintf ( name2, "%s_%d_%d.pdb", name, start, end); sprintf ( command, "extract_from_pdb -infile \'%s\' -chain FIRST -coor %d %d > %s%s",check_file_exists(name1),start, end, get_cache_dir(),name2); if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_struc] %s\n", command); my_system (command); if ( is_pdb_file(name2))return name2; else { fprintf ( stderr, "\nERROR Could not extract segment [%d %d] from structure %s [FATAL:%s]\n",start, end, name, PROGRAM);crash(""); } exit (EXIT_FAILURE); } return NULL; } char* seq_is_pdb_struc ( Sequence *S, int i) { if (!S){return NULL;} else if ( !S->T[i]){return NULL;} else if ( !((S->T[i])->P)){return NULL;} else return ((S->T[i])->P)->template_file; } char* is_pdb_struc ( char *name) { /*Receives a name checks if this is the name of a local file that contains PDB data checks if this is the name of a file from a local db put the file in the cache checks if this is a file from a remote db (extract_from_pdb return NULL if everything fails */ static char *file_name1; static char *file_name2; static char **buf_names; static char **buf_result; static int nbuf, s; char *r=NULL; char command[1000]; if ( !name || name[0]=='\0')return NULL; /*Use the look up*/ if ( !buf_names) { buf_names=vcalloc ( 1000, sizeof (char*)); buf_result=vcalloc ( 1000, sizeof (char*)); file_name1=vcalloc ( 1000, sizeof (char)); file_name2=vcalloc ( 1000, sizeof (char)); } if ( (s=name_is_in_list ( name, buf_names,nbuf,-1))!=-1)return buf_result[s]; r=NULL; sprintf ( file_name1, "%s", name); sprintf ( file_name2, "%s.pdb", name); if (is_pdb_file(file_name1)){r=file_name1;} else if (is_pdb_file(file_name2)){r=file_name2;} else if (is_pdb_name (name)) { char *tmpname; tmpname=vtmpnam (NULL); sprintf ( file_name2, "%s.pdb", name); /*sprintf ( command, "extract_from_pdb -netfile \'%s\' > %s%s 2>/dev/null",name, get_cache_dir(), file_name2);*/ sprintf ( command, "extract_from_pdb -netfile \'%s\' > %s 2>/dev/null",name,tmpname); if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:is_pdb_struc] %s\n", command); my_system (command); sprintf ( command, "cp %s %s%s", tmpname, get_cache_dir(), file_name2); my_system (command); if ( is_pdb_file(file_name2))r=file_name2; else r=NULL; } /*Fill the buffer*/ buf_names[nbuf]=vcalloc ( strlen (name)+1, sizeof (char)); sprintf ( buf_names[nbuf], "%s", name); if ( r) { buf_result[nbuf]=vcalloc ( strlen (r)+1, sizeof (char)); sprintf (buf_result[nbuf], "%s", r); } else buf_result[nbuf]=NULL; nbuf++; return r; } char *fix_pdb_file ( char *in) { char *empty; empty=vcalloc(1, sizeof(char)); if ( !in || !check_file_exists (in))return empty; else if ( is_pdb_file(in))return in; else { char command[10000]; char *tmp; char *tmp2; tmp=vtmpnam (NULL); tmp2=vcalloc (strlen (tmp)+1, sizeof (char)); sprintf (tmp2, "%s", tmp); sprintf ( command, "extract_from_pdb %s > %s", check_file_exists(in), tmp2); my_system (command); if ( is_pdb_file (tmp))return tmp2; else return empty; } } int is_sap_file ( char *name) { FILE *fp; if (!name); if (!check_file_exists(name))return 0; if ((fp=find_token_in_file (name, NULL, "Percent"))!=NULL) { if ((fp=find_token_in_file (name,fp, "Percent"))!=NULL) { vfclose (fp); return 1; } else { return 0; } } else { return 0; } } int is_blast_file ( char *name) { if ( !check_file_exists(name) ) return 0; else if (token_is_in_file (name, "")) { return BLAST_XML; } else { if (token_is_in_file (name, "Lambda")) { return BLAST_TXT; } else { return 0; } } return 0; } int is_simple_pdb_file ( char *name) { FILE *fp; if ((fp=find_token_in_file (name, NULL, "SIMPLE_PDB_FORMAT"))!=NULL){vfclose (fp);return 1;} return 0; } int is_pdb_file ( char *name) { FILE *fp; int ispdb=0; if ( name==NULL) return 0; if (!check_file_exists (name))return 0; if ((fp=find_token_in_file (name, NULL, "\nHEADER"))!=NULL) {vfclose (fp); ispdb++; } if ((fp=find_token_in_file (name, NULL, "\nSEQRES"))!=NULL) {vfclose (fp); ispdb++; } if ((fp=find_token_in_file (name, NULL, "\nATOM"))!=NULL) { vfclose (fp); ispdb++; } else { ispdb=0; } if ( ispdb>=2)return 1; else return 0; } int is_seq ( char *name) { char *format; if ( !check_file_exists(name))return 0; format= identify_seq_format(name); if(!format || format[0]=='\0'){vfree (format);return 0;} else if (strstr(format, "seq")){vfree (format);return 1;} else return 0; } int is_aln ( char *name) { char *format; if ( !check_file_exists (name))return 0; format= identify_seq_format(name); if ( !format || format[0]=='\0'){vfree (format);return 0;} else if (strstr(format, "aln")){vfree (format); return 1;} else return 0; } int is_matrix (char *name) { int **m; if ((m=read_matrice (name))!=NULL){free_int (m, -1); return 1;} return 0; } int is_newick (char *name) { int c; FILE *fp; fp=vfopen (name, "r"); if ( (c=fgetc(fp))!='('){vfclose (fp); return 0;} while ( (c=fgetc(fp))!=EOF) { if ( c==';'){vfclose (fp); return 1;} } vfclose (fp); return 0; } int is_clustalw_matrix ( char *name) { FILE *fp; if ( (fp=find_token_in_file (name, NULL, "CLUSTALW_MATRIX"))!=NULL){vfclose(fp);return 1;} else return 0; } int is_pavie_matrix ( char *name) { FILE *fp; if ( (fp=find_token_in_file (name, NULL, "PAVIE_MATRIX"))!=NULL){vfclose(fp);return 1;} else return 0; } int is_distance_matrix_file (char *name) { FILE *fp; if ( (fp=find_token_in_file (name, NULL, "TC_DISTANCE_MATRIX_FORMAT_01"))!=NULL){vfclose(fp);return 1;} else return 0; } int is_similarity_matrix_file (char *name) { FILE *fp; if ( (fp=find_token_in_file (name, NULL, "TC_SIMILARITY_MATRIX_FORMAT_01"))!=NULL){vfclose(fp);return 1;} else return 0; } int is_blast_matrix ( char *name) { FILE *fp; if ( (fp=find_token_in_file (name, NULL, "BLAST_MATRIX"))!=NULL){vfclose(fp);return 1;} else return 0; } int is_single_seq_weight_file ( char *name) { return token_is_in_file ( name, "SINGLE_SEQ_WEIGHT_FORMAT_01"); } int is_stockholm_aln (char *file) { FILE *fp; if ((fp=find_token_in_file_nlines (file, NULL, "STOCKHOLM",2))) { vfclose (fp); return 1; } return 0; } int is_lib ( char *name) { return is_lib_01(name); } int is_lib_02 ( char *name) { return token_is_in_file ( name, "TC_LIB_FORMAT_02"); } int is_lib_01 (char *name) { if ( token_is_in_file ( name, "TC_LIB_FORMAT_01")) return 1; else if (token_is_in_file ( name, "T-COFFEE_LIB_FORMAT_01"))return 1; else if (token_is_in_file (name, "SEQ_1_TO_N"))return 1; else return 0; } int is_lib_list ( char *name) { if ( !check_file_exists (name))return 0; if ( token_is_in_file ( name, "TC_LIB_LIST_FORMAT_01")) return 1; return 0; } int is_method ( char *file) { char new_file[200]; sprintf ( new_file, "%s", file); if ( (token_is_in_file(new_file, "TC_METHOD_FORMAT_01"))){return 1;} if ( is_in_pre_set_method_list(new_file)) { vremove ( new_file); return 1; } else { return 0; } } /*******************************************************************************************/ /* */ /* */ /* SEQUENCE FORMAT IDENTIFIERS */ /* */ /***************************************************************************************** */ int type_is_exon_boundaries(char **seq, int n) { int a, l, b; for (a=0; aseq[0]){free_sequence (S, S->nseq); return 1;} l=strlen ( S->seq[0]); for ( a=0; a< S->nseq; a++)if(strlen(S->seq[a])!=l){free_sequence (S, S->nseq);return 1;} for ( a=0; a< S->nseq; a++) { l1=strlen ( S->seq[a]); ungap (S->seq[a]); l2=strlen ( S->seq[a]); if ( l1!=l2) { free_sequence (S, S->nseq); return 0; } } free_sequence (S, S->nseq); return 1; } else { return 0; } } int format_is_fasta ( char *file) { Sequence *S; if ( !check_file_exists(file))return 0; if ( get_first_non_white_char (file)!='>')return 0; if ( !(S=get_fasta_sequence (file, NULL)))return 0; free_sequence (S, -1); if ( format_is_pir(file)) return 0; return 1; } int format_is_pir_aln ( char *file) { if ( format_is_pir(file) && !format_is_pir_seq(file))return 1; else return 0; } int format_is_pir_seq ( char *file) { int a, l1, l2; Sequence *S; if ( format_is_pir (file)) { S=get_pir_sequence (file, NULL); for ( a=0; a< S->nseq; a++) { l1=strlen ( S->seq[a]); ungap (S->seq[a]); l2=strlen ( S->seq[a]); if ( l1!=l2) { free_sequence (S, S->nseq); return 0; } } return 1; } else { return 0; } } int format_is_pir ( char *file) { Sequence *S; int pir_name=1, star_end=1, a; S=get_fasta_sequence (file, NULL); if (!S)return 0; else if (!S->seq[0])return 0; pir_name=1; star_end=1; for (a=0; a< S->nseq; a++) { int l; if (!is_pir_name(S->name[a]))pir_name=0; l=strlen (S->seq[a]); if (!l || (l && S->seq[a][l-1]!='*')) star_end=0; } free_sequence(S,-1); if ( pir_name && star_end) return 1; else return 0; } int is_pir_name (char *name) { if ( strstr (name, "P1;"))return 1; if ( strstr (name, "F1;"))return 1; if ( strstr (name, "DL;"))return 1; if ( strstr (name, "DC;"))return 1; if ( strstr (name, "RL;"))return 1; if ( strstr (name, "RC;"))return 1; if ( strstr (name, "XX;"))return 1; return 0; } int format_is_conc_aln (char *file) { FILE *fp; if ( (fp=find_token_in_file (file, NULL, "CONC_MSF_FORMAT_01"))){vfclose (fp); return 1;} return 0; } int format_is_saga ( char *file) { FILE *fp; int **list; int n_blocks; int n_seq; int a, b; if ( (fp=find_token_in_file (file, NULL, "SAGA"))){vfclose (fp); return 1;} else if ((fp=find_token_in_file (file, NULL, "CLUSTAL"))){vfclose (fp); return 1;} else if ((fp=find_token_in_file (file, NULL, "ClustalW"))){vfclose (fp); return 1;} else if ((fp=find_token_in_file (file, NULL, "clustalw"))){vfclose (fp); return 1;} else if ((fp=find_token_in_file (file, NULL, "clustal"))){vfclose (fp); return 1;} else if ((fp=find_token_in_file (file, NULL, "T-COFFEE_MSA"))){vfclose (fp); return 1;} else if ((fp=find_token_in_file (file, NULL, "INTERLEAVED_MSA"))){vfclose (fp); return 1;} else return 0; if (1==1); else if ((fp=find_token_in_file (file, NULL, "T-COFFEE"))){vfclose (fp); return 1;} else if ((fp=find_token_in_file (file, NULL, "SAGA_FORMAT"))){vfclose (fp); return 1;} else if ((fp=find_token_in_file (file, NULL, "GARP"))){vfclose (fp); return 1;} else if ((fp=find_token_in_file (file, NULL, "INTERLEAVED"))){vfclose (fp); return 1;} else { list=get_file_block_pattern (file,&n_blocks,100); if (n_blocks<=2){free_int (list, -1);return 0;} else { n_seq=list[1][0]; for ( a=1; a< n_blocks-1; a++) { if ( list[a][0]!=n_seq){free_int (list, -1);return 0;} else { for ( b=1; b<=list[a][0]; b++) if ( list[a][b]!=2){free_int (list, -1);return 0;} } } } return 1; } return 0; } int format_is_swissprot (char *name) { FILE *fp; if ( !check_file_exists(name))return 0; if ( (fp=find_token_in_file_nlines (name,NULL,"\nID ",10))!=NULL\ &&(fp=find_token_in_file (name,NULL,"\nSQ "))!=NULL ) { vfclose (fp);return 1; } else { return 0; } } /*******************************************************************************************/ /* */ /* */ /* OUTPUT STUFF */ /* */ /***************************************************************************************** */ int output_format_aln ( char *format, Alignment *inA, Alignment *inEA,char *name) { Sequence_data_struc *D1=NULL; Sequence_data_struc *D2=NULL; Alignment *A=NULL; Alignment *EA=NULL; A =copy_aln (inA, NULL); A->CL=inA->CL; EA=copy_aln (inEA,NULL); A =expand_aln(A); EA=expand_number_aln(inA,EA); if (A && A->expanded_order )A=reorder_aln ( A, A->expanded_order,A->nseq); if (EA && EA->expanded_order)EA=reorder_aln ( EA, EA->expanded_order,EA->nseq); D1=vcalloc ( 1, sizeof (Sequence_data_struc)); D1->A=A; if (EA) { D2=vcalloc ( 1, sizeof (Sequence_data_struc)); D2->A=EA; } main_output ( D1, NULL,D2, format, name); vfree(D1); vfree(D2); free_aln (A); free_aln (EA); return 1; } int main_output (Sequence_data_struc *D1, Sequence_data_struc *D2, Sequence_data_struc *DST, char *out_format, char *out_file) { FILE *fp; int value; Alignment *BUF_A; int expanded=0; if ( !out_format[0])return 0; if ( D1 && D1->rm_gap)ungap_aln ((D1->A)); if ( (strstr (out_format, "expanded_"))) { if (!D1) return 1; out_format+=strlen ("expanded_"); BUF_A=copy_aln (D1->A, NULL); (D1->A)=thread_profile_files2aln ((D1->A), NULL, NULL); expanded=1; } if ( strm (out_format, ""))return 0; else if ( ( strm (out_format, "aln2lib"))) { int a, b, c; int r1,r2,s1, s2,s; Constraint_list *CL; FILE *fp; Alignment *IN; int **pos; if (!D1)return 1; IN=D1->A; CL=(D1->A)->CL; pos=aln2pos_simple(IN, IN->nseq); fp=vfopen (out_file, "w"); fp=save_list_header (fp,CL); for ( b=0; b< IN->nseq-1; b++) { for ( c=b+1; c< IN->nseq; c++) { s1=IN->order[b][0]; s2=IN->order[c][0]; fprintf ( fp, "#%d %d\n", s1+1, s2+1); for ( a=0; a< IN->len_aln; a++) { r1=pos[b][a]; r2=pos[c][a]; if ( s1==s2 && !CL->do_self)continue; if ( s1< s2)s=(CL->evaluate_residue_pair)( CL, s1, r1, s2, r2); else s=(CL->evaluate_residue_pair)( CL, s2, r2, s1, r1); s=(s!=UNDEFINED)?s:0; if ( r1>0 && r2>0) { fprintf (fp, "\t%5d %5d %5d \n", r1, r2, s); } } } } vfclose (save_list_footer (fp, CL)); } else if ( strncmp (out_format, "score",5)==0 || strm (out_format, "html")) { Alignment *BUF; if (!D1)return 1; if ( !DST) { fprintf ( stderr,"\n[You Need an evaluation File: Change the output format or use +evaluate][FATAL:%s]\n", PROGRAM); myexit(EXIT_FAILURE); } if ( !strm ("html", out_format))while ( out_format[0]!='_' && out_format[0]!='\0' )out_format++; D1->S=aln2seq(D1->A); BUF=copy_aln (DST->A, NULL); DST->A=aln2number (DST->A); if ( strstr ( out_format, "html" ))output_reliability_html ( D1->A, DST->A, out_file); else if( strm ( out_format, "_ps" ))output_reliability_ps ( D1->A, DST->A, out_file); else if( strm ( out_format, "_pdf" ))output_reliability_pdf ( D1->A, DST->A, out_file); else if( strm ( out_format, "_ascii" ))output_reliability_ascii ( D1->A, DST->A, out_file); else if( strm ( out_format, "_seq" ))output_seq_reliability_ascii ( D1->A, DST->A, out_file); else { DST->A=BUF; main_output (DST, NULL, NULL, out_format+1, out_file); } } else if (strm (out_format, "sec_html") || strm (out_format, "_E_html")) { Alignment *ST, *A; Sequence *S; int a, b,c,i, ns=0; char *buf; if (!D1)return 1; A=D1->A; S=A->S; ST=copy_aln (A, NULL); for (a=0; anseq; a++) { i=name_is_in_list (ST->name[a],S->name, S->nseq, 100); if ( i!=-1) { buf=seq2E_template_string(S, i); if ( buf==NULL)continue; else ns++; for (c=0,b=0; blen_aln; b++) { int r1, s; r1=ST->seq_al[a][b]; if ( r1!='-') { s=tolower (buf[c]); if (s=='e')r1='0'; else if (s=='h')r1='9'; else if (s=='c')r1='5'; c++; } ST->seq_al[a][b]=r1; } } } if (!ns) { printf_exit ( EXIT_FAILURE, stderr, "\nYou must provide a TM template file [FATAL:%s]", PROGRAM); } output_color_html ( A, ST, out_file); } else if (strm (out_format, "tm_html") || strm (out_format, "_T_html")) { Alignment *ST, *A; Sequence *S; int a, b,c,i, ns=0; char *buf; if (!D1)return 1; A=D1->A; S=A->S; ST=copy_aln (A, NULL); for (a=0; anseq; a++) { i=name_is_in_list (ST->name[a],S->name, S->nseq, 100); if ( i!=-1) { buf=seq2T_template_string(S, i); if ( buf==NULL)continue; else ns++; for (c=0,b=0; blen_aln; b++) { int r1, s; r1=ST->seq_al[a][b]; if ( r1!='-') { s=tolower (buf[c]); if (s=='o')r1='0'; else if (s=='h')r1='9'; else if (s=='i')r1='5'; c++; } ST->seq_al[a][b]=r1; } } } if (!ns) { printf_exit ( EXIT_FAILURE, stderr, "\nYou must provide a TM template file [FATAL:%s]", PROGRAM); } output_color_html ( A, ST, out_file); } else if (strm (out_format, "color_exoset")) { Alignment *ST, *EX, *A; Constraint_list *CL; int a, b, n; char *buf; if ( !DST->A) { printf_exit ( EXIT_FAILURE, stderr, "\nYou must provide an obj file via the -struc_in flag [FATAL:%s]", PROGRAM); } EX=DST->A; A=D1->A; CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt")); ST=copy_aln (A, NULL); buf=vcalloc ( EX->len_aln+1, sizeof (int)); for ( a=0; a< A->nseq; a++) { int i; i=name_is_in_list (A->name[a],EX->name, EX->nseq, -1); if ( i==-1)continue; sprintf ( buf, "%s", EX->seq_al[i]); ungap (buf); for (n=0,b=0; blen_aln; b++) { if (!is_gap(A->seq_al[a][b])) { if ( buf[n]=='o') ST->seq_al[a][b]='0'; else if ( buf[n]=='j') ST->seq_al[a][b]='1'; else if ( buf[n]=='b') ST->seq_al[a][b]='2'; n++; } } } vfree (buf); output_color_html ( A, ST, out_file); return EXIT_SUCCESS; } else if (strm (out_format, "color_protogene")) { int n, a, b; DST->A=copy_aln (D1->A, NULL); for (n=1,a=0; a< (D1->A)->len_aln; a++, n++) { for ( b=0; b<(D1->A)->nseq; b++) { if (is_gap((D1->A)->seq_al[b][a])); else if ( n<=3)(DST->A)->seq_al[b][a]=2; else if ( n>3)(DST->A)->seq_al[b][a]=9; } if ( n==6)n=0; } output_color_html ( D1->A, DST->A, out_file); return EXIT_SUCCESS; } else if ( strncmp (out_format, "color",5)==0) { Alignment *BUF; if (!D1)return 1; if ( !DST) { fprintf ( stderr,"\n[You Need an evaluation File: Change the output format or use +evaluate][FATAL:%s]\n", PROGRAM); myexit(EXIT_FAILURE); } while ( out_format[0]!='_' && out_format[0]!='\0' )out_format++; BUF=copy_aln (DST->A, NULL); if ( strm ( out_format, "_html" ))output_color_html ( D1->A, DST->A, out_file); else if( strm ( out_format, "_ps" ))output_color_ps ( D1->A, DST->A, out_file); else if( strm ( out_format, "_pdf" ))output_color_pdf ( D1->A, DST->A, out_file); else if( strm ( out_format, "_ascii" ))output_color_ascii ( D1->A, DST->A, out_file); else { DST->A=BUF; return main_output (DST, NULL, NULL, out_format+1, out_file); } return EXIT_SUCCESS; } else if ( strm4 ( out_format, "tc_aln","t_coffee_aln", "t_coffee", "tcoffee")) { if (!D1)return 1; vfclose (output_aln ( D1->A, vfopen (out_file, "w"))); } else if ( strm ( out_format, "analyse_pdb")) { if (!D1)return 1; if ( !DST) { fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM); myexit(EXIT_FAILURE); } analyse_pdb ( D1->A,DST->A, "stdout"); (DST->A)=aln2number (DST->A); output_reliability_ps ( D1->A, DST->A, out_file); } else if ( strm4 ( out_format, "lower0", "lower1", "lower2", "lower3") || strm4(out_format, "lower4", "lower5", "lower6", "lower7") || strm4 (out_format,"lower8", "lower9", "align_pdb", "malign_pdb") ) { if (!D1)return 1; if ( !DST) { fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM); myexit(EXIT_FAILURE); } (DST->A)=aln2number (DST->A); if ( strm (out_format, "align_pdb"))value=0; else if ( strm (out_format, "malign_pdb"))value=5; else value=atoi(out_format+5); D1->A=filter_aln_upper_lower (D1->A, DST->A,0, value); output_clustal_aln ( out_file, D1->A); } else if ( strnm (out_format, "repeat", 6)) { int size; int a, b, c; Alignment *CONC; if ( !D1)return 1; size=atoi (out_format+6); print_aln (D1->A); CONC=declare_aln2 ( (D1->A)->nseq, ((D1->A)->len_aln+1)*size+1); for ( a=0; a< (D1->A)->nseq; a++)(D1->A)->seq_al[a][(D1->A)->len_aln]='\0'; for ( c=0,a=0; a< (D1->A)->nseq;c++) { sprintf ( CONC->name[c], "%s", (D1->A)->name[a]); for ( b=0; bseq_al[c], (D1->A)->seq_al[a]); strcat (CONC->seq_al[c], "O"); } } CONC->nseq=c;CONC->len_aln=strlen (CONC->seq_al[0]); output_clustal_aln ( out_file, CONC); free_aln (CONC); } else if ( strnm (out_format, "upper", 5)) { if (!D1)return 1; if ( !DST) { fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM); myexit(EXIT_FAILURE); } (DST->A)=aln2number (DST->A); value=atoi(out_format+5); D1->A=filter_aln_lower_upper (D1->A, DST->A,0, value); output_clustal_aln ( out_file, D1->A); } else if ( strm4 ( out_format, "filter0", "filter1", "filter2", "filter3")) { if (!D1)return 1; if ( !DST) { fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM); myexit(EXIT_FAILURE); } (DST->A)=aln2number (DST->A); D1->A=filter_aln (D1->A, DST->A, atoi(out_format+6)); output_clustal_aln ( out_file, D1->A); } else if ( strm3 ( out_format, "phylip_aln", "phylip", "phy")) { if (!D1)return 1; output_phylip_aln ( out_file, D1->A); } else if ( strm ( out_format, "mocca_aln")) { if (!D1)return 1; output_mocca_aln ( out_file, D1->A, DST->A); } else if ( strm ( out_format, "saga_pw_sd_weights") ) { if (!D1)return 1; output_pw_weights4saga ((D1->W),(D1->W)->PW_SD, out_file); } else if ( strm ( out_format, "saga_aln")) { if (!D1)return 1; output_saga_aln (out_file, D1->A); } else if (strm2 ( out_format, "aln","clustal_tc")|| strm (out_format, "msa")) { if (!D1)return 1; output_clustal_aln (out_file, D1->A); } else if (strm5 ( out_format, "strict_clustal","clustal_aln", "clustalw","clustal", "clustalw_aln") || strm (out_format,"number_aln")) { if (!D1)return 1; output_strict_clustal_aln (out_file, D1->A); } else if ( strm ( out_format, "conc_aln")) { if (!D1)return 1; output_conc_aln (out_file, D1->A); } else if ( strm2 ( out_format, "lalign_aln","lalign")) { if (!D1)return 1; output_lalign (out_file, D1->A); } else if ( strm2 ( out_format, "glalign_aln","glalign")) { if (!D1)return 1; output_glalign (out_file, D1->A, DST->A); } else if ( strm2 ( out_format, "fasta_aln","fasta" ) || strm (out_format, "blast_aln")) { if (!D1)return 1; output_fasta_aln( out_file, D1->A); } else if ( strm (out_format, "overaln")) { Alignment *EB=NULL; char *s, mode[100]; OveralnP *F; int eb=0; if (!D1) return 1; F=vcalloc (1, sizeof (OveralnP)); string_array_upper ((D1->A)->seq_al, (D1->A)->nseq); if ( D2 && D2->A) { D1->A=mark_exon_boundaries (D1->A, D2->A); eb=1; } else if ( (s=get_string_variable ("exon_boundaries"))) { Sequence *S; Alignment *EB; EB=seq2aln(S=main_read_seq(s),NULL, 0); D1->A=mark_exon_boundaries (D1->A, EB); free_sequence (S, S->nseq); free_aln (EB); eb=1; } sprintf (F->mode, "%s", ((s=get_string_variable ("overaln_mode")))?s:"lower"); if (!strm (F->mode, "lower") && !strm (F->mode, "unalign"))printf_exit (EXIT_FAILURE,stderr,"\nERROR: unknown overal_mode in overal output [%s] [FATAL:%s]", mode, PROGRAM); if (int_variable_isset ("overaln_threshold"))F->t=get_int_variable ("overaln_threshold"); if (int_variable_isset ("overaln_target"))F->f=get_int_variable ("overaln_target"); if (int_variable_isset ("overaln_P1"))F->p1=get_int_variable ("overaln_P1"); if (int_variable_isset ("overaln_P2"))F->p2=get_int_variable ("overaln_P2"); if (int_variable_isset ("overaln_P3"))F->p3=get_int_variable ("overaln_P3"); if (int_variable_isset ("overaln_P4"))F->p4=get_int_variable ("overaln_P4"); if (eb)sprintf (F->model, "fsa2"); else sprintf (F->model, "fsa1"); D1->A=aln2clean_pw_aln (D1->A, F); //if (eb)D1->A=aln2clean_pw_aln (D1->A, mode,t, f,p1,p2,p3, "fsa2"); //else D1->A=aln2clean_pw_aln (D1->A, mode,t, f,p1,p2,p3, "fsa1"); D1->S=aln2seq(D1->A); output_clustal_aln (out_file, D1->A); } else if ( strm ( out_format, "est_prf" )) { if (!D1)return 1; output_est_prf( out_file, D1->A); } else if ( strm ( out_format, "clean_est_fasta_seq" )) { if (!D1)return 1; D1->A=clean_est(D1->A); output_fasta_seq(out_file, D1->A); } else if ( strm3 ( out_format, "msf_aln", "gcg", "msf")) { if (!D1)return 1; output_msf_aln( out_file, D1->A); } else if ( strm ( out_format, "rnalign")) { if (!D1)return 1; output_rnalign (out_file, D1->A, DST->S); } else if ( strm ( out_format, "fasta_seq") ||strm ( out_format, "list")||strm ( out_format, "file_list")) { int z; if (!D1)return 1; output_fasta_seq (out_file,D1->A); } else if (strm (out_format, "fasta_tree") ) { if (!D1)return 1; output_fasta_tree (out_file,D1->A); } else if ( strm ( out_format, "gotoh_seq")) { if (!D1)return 1; output_gotoh_seq (out_file,D1->A); } else if ( strm (out_format, "fasta_seq1")) { if (!D1)return 1; output_fasta_seq1 (out_file, D1->A); } else if ( strm2 (out_format, "pir_aln", "pir")) { if (!D1)return 1; output_pir_aln (out_file, D1->A); } else if ( strm (out_format, "pir_seq")) { if (!D1)return 1; output_pir_seq (out_file, D1->A); } else if ( strm (out_format, "gor_seq")) { if (!D1)return 1; output_gor_seq (out_file, D1->A); } else if ( strm (out_format, "pir_seq1")) { if (!D1)return 1; output_pir_seq1 (out_file, D1->A); } else if ( strm (out_format, "pw_lib_saga_aln")) { if (!D1)return 1; output_pw_lib_saga_aln (out_file, D1->A); } else if ( strm (out_format, "lib")) { if (!D1)return 1; output_lib (out_file, D1->A); } else if ( strm (out_format, "pdb_constraint_list")) { if (!D1)return 1; output_constraints (out_file, "pdb",D1->A); } else if ( strm2 (out_format, "constraint_list","tc_lib")) { if (!D1)return 1; else if (!D1->CL)output_constraints (out_file,"sim", D1->A); else if (D1->CL) vfclose ( save_constraint_list ( D1->CL, 0, (D1->CL)->ne, out_file, NULL, "ascii",(D1->CL)->S)); } else if ( strm2 (out_format, "extended_lib","extended_cosmetic")) { if (!D1)return 1; output_constraints (out_file,out_format, D1->A); } else if ( strncmp (out_format, "extended_pair", 13)==0) { if (!D1)return 1; output_constraints (out_file,out_format, D1->A); } else if ( strm (out_format, "cache_id")) { if (!D1)return 1; cache_id (D1->A); output_saga_aln (out_file, D1->A); } else if ( strm (out_format, "compress_aln")) { if (!D1)return 1; compress_aln (D1->A); output_saga_aln (out_file, D1->A); } else if (strm (out_format, "n_seq") ||strm (out_format, "nseq") ) { if (!D1)return 1; fp=vfopen ( out_file, "w"); fprintf ( fp, "%d\n", (D1->A)->nseq); vfclose (fp); } else if ( strm ( out_format, "thread_dna_on_prot_aln")) { if (!D1)return 1; D1->A=thread_dnaseq_on_prot_aln (D1->S, D2->A); output_saga_aln ( out_file, D1->A); } else if ( strm ( out_format, "tdna_fasta_seq1")) {if (!D1)return 1; D1->A=translate_dna_aln (D1->A,0); output_fasta_seq1 (out_file, D1->A); } else if ( strm ( out_format, "tdna_aln")) {if (!D1)return 1; D1->A=translate_dna_aln (D1->A,0); output_saga_aln ( out_file, D1->A); } else if ( strm ( out_format, "cdna_fasta_seq1")) {if (!D1)return 1; D1->A= gene2prot(D1->A); output_fasta_seq1 ( out_file, D1->A); } else if ( strm ( out_format, "mutate_cdna_aln")) {if (!D1)return 1; D1->A= mutate_cdna_aln ( D1->A); output_clustal_aln ( out_file, D1->A); } else if ( strm ( out_format, "tdna_sp_aln")) { if (!D1)return 1; if ( !DST) { fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM); myexit(EXIT_FAILURE); } (DST->A)=aln2number (DST->A); D1->A=translate_splice_dna_aln (D1->A, DST->A); output_saga_aln ( out_file, D1->A); } else if (out_format && out_format[0] && (strcmp ( out_format,"rna_graph_fasta")==0)) { if (!D1)return 1; sprintf ( (D1->A)->seq_al[0], "%s",(DST->S)->seq[0]); (D1->A)->nseq=0; output_fasta_seq (out_file, DST->A); } else if (strm ( out_format, "freq_mat")) { if (!D1)return 1; output_freq_mat (out_file, D1->A); } else if (strm ( out_format, "maln_pval")) {if (!D1)return 1; output_maln_pval ( out_file, D1->A); } else if ( strm ( out_format, "model_aln")) { if (!D1)return 1; output_model_aln ( out_file, D1->A); } else if (strncmp (out_format, "mult",4)==0) { if (!D1)return 1; output_mult_fasta_seq ( out_file, D1->A, atoi(out_format+4)); } else if (strm (out_format, "conservation")) { output_conservation_statistics (out_file, D1->A); } else if (strm (out_format, "len")) { if (!D1)return 1; output_statistics (out_file, D1->A, "nrl"); } else if ( strm (out_format, "name")) { if (!D1)return 1; if ( D1->A)output_statistics (out_file, D1->A, "n"); if ( D1->T) { Sequence *TS; TS=tree2seq(D1->T, NULL);print_array_char (vfopen(out_file, "w"), TS->name, TS->nseq, "\n"); } } else if ( strm (out_format, "code_name")) { char **nl=NULL; int num, n=0; Sequence *TS; FILE *lfp; if ( D1->A){n=(D1->A)->nseq, nl=(D1->A)->name;} if ( D1->T){TS=tree2seq(D1->T, NULL);nl=TS->name;n=TS->nseq;} lfp=vfopen (out_file, "w"); for ( num=0; numA); } else if ( strstr ( out_format, "pavie_age_channel")) { output_n_pavie_age_channel ( D1->S,out_file, atoi((out_format+strlen ("pavie_age_channel")))); return EXIT_SUCCESS; } else if ( strstr ( out_format, "age_matrix")) { output_age_matrix (out_file, atoi((out_format+10))); } else if ( strm ( out_format, "transitions")) { output_transitions (out_file, D1->A); } else if ( strncmp (out_format, "statistics",10)==0) { if (!D1)return 1; output_statistics (out_file, D1->A,out_format+10); } else if ( strm4 (out_format, "newick_tree","newick","binary","nh")) { if (!D1)return 1; /*D1->T=unroot_tree(D1->T);*/ vfclose (print_tree ((D1->T), out_format, vfopen ( out_file, "w"))); } else if ( strncmp (out_format, "sarsim", 6)==0) { if (!D1)return 1; compare_sar_sequence (D1->S, (D2 &&D2->S)?D2->S:D1->S, atoi(out_format+6)); return EXIT_SUCCESS; } else if ( strncmp (out_format, "sim",3)==0) { if (!D1)return 1; output_similarities (out_file, D1->A,out_format); } else if ( strncmp (out_format, "cov",3)==0) { if (!D1)return 1; output_similarities (out_file, D1->A,out_format); } else if ( strm (out_format, "stockholm_aln")) { output_stockholm_aln (out_file,D1->A, (D2)?D2->A:NULL); } else if ( strm (out_format, "pair_sim")) { if ( !D2) { fprintf ( stderr, "\n-output=pair_sim: provide aln1 via -in and aln2 via -in2 [FATAL:%s]\n", PROGRAM); myexit (EXIT_FAILURE); } output_similarities_pw (out_file, D1->A,D2->A,out_format); } else if ( strm (out_format, "matrix") || strm (out_format, "blast_matrix")) { output_blast_mat (D1->M, out_file); } else { fprintf ( stderr, "\n%s is an UNKNOWN OUTPUT FORMAT [FATAL:%s]\n",out_format, PROGRAM); myexit (EXIT_FAILURE); } //Remove the expansion if ( expanded) { free_aln (D1->A); D1->A=BUF_A; } return 0; } int is_in_format_list ( char *name) { if ( strcmp ( name, "saga_aln")==0)return 1; if ( strcmp ( name, "number_aln")==0)return 1; if ( strcmp ( name, "clustal_aln")==0)return 1; if ( strcmp ( name, "fasta_aln")==0)return 1; if ( strcmp ( name, "number_fasta")==0)return 1; if ( strcmp ( name, "fasta_seq")==0)return 1; if ( strcmp ( name, "pdb")==0)return 1; if ( strcmp ( name, "msf_aln")==0)return 1; if ( strcmp ( name, "dali_aln")==0)return 1; if ( strcmp ( name, "dali_seq")==0)return 1; if ( strcmp ( name, "barton_list_tc")==0)return 1; if ( strcmp ( name, "est_prf")==0)return 1; if ( strcmp ( name, "gotoh_aln")==0)return 1; if ( strcmp ( name, "amps_aln")==0)return 1; if ( strcmp ( name, "pir_aln")==0)return 1; if ( strcmp ( name, "pir_seq")==0)return 1; if ( strcmp ( name, "est_fasta")==0)return 1; if ( strcmp ( name, "amps_sd_scores")==0)return 1; if ( strcmp ( name, "pima_aln")==0)return 1; if ( strcmp ( name, "dialign_aln")==0)return 1; if ( strcmp ( name, "gor_seq")==0)return 1; if ( strcmp ( name, "gor_struc")==0)return 1; if ( strcmp ( name, "stockholm_aln")==0)return 1; return 0; } int is_struc_in_format_list ( char *name) { if ( strcmp ( name, "rna_number")==0)return 1; if ( strcmp ( name, "fasta_seq")==0)return 1; return 0; } char *format_name2aln_format_name (char *name) { if ( strm (name, "gcg"))sprintf (name, "msf"); else if ( strm (name, "fasta"))sprintf (name, "fasta_aln"); return name; } int is_out_format_list ( char *name) { return main_output (NULL, NULL, NULL, name, NULL); } int is_struc_out_format_list ( char *name) { return main_output (NULL, NULL, NULL, name, NULL); } /**************************************************************************************************/ /*************************************REFORMAT UTIL*************************************************/ /**************************************************************************************************/ /*************************************REFORMAT IN**************************************************/ /**************************************************************************************************/ /*******************************************************************************************/ /* */ /* */ /* READ COG FILE */ /* */ /***************************************************************************************** */ /*******************************************************************************************/ /* */ /* */ /* INPUT WEIGHTS */ /* */ /***************************************************************************************** */ Weights* get_amps_sd_scores ( char *fname) { FILE *fp; char *buf; char *buf2; int nseq; Weights *W; int a, b,e; int c; float array[20]; buf=vcalloc ( 1001, sizeof (char)); buf2=vcalloc ( 1001, sizeof (char)); fp=vfopen ( fname, "r"); set_fp_id ( fp, "Index"); buf=fgets ( buf, 1000, fp); fscanf ( fp, "%s", buf2); nseq=0; while ( isalnum(buf2[0]) && !isalpha(buf2[0])) { nseq++; buf=fgets ( buf, 1000, fp); fscanf ( fp, "%s", buf2); } vfclose ( fp); W=declare_weights (nseq); fp=vfopen ( fname, "r"); set_fp_id ( fp, "Index"); buf=fgets ( buf, 1000, fp); fscanf ( fp, "%s", buf2); a=0; while ( isalnum(buf2[0]) && !isalpha(buf2[0])) { fp=set_fp_after_char (fp, '>'); fscanf ( fp, "%s",W->seq_name[a]); buf=fgets ( buf, 1000, fp); fscanf ( fp, "%s", buf2); a++; } buf=fgets ( buf, 1000, fp); c=1; while ( c!=0) { for ( e=0; e< 16; e++) { c=fscanf ( fp, "%f", &array[e]); } fscanf ( fp, "\n"); if ( c!=0) { a=(int)array[0]-1; b=(int)array[1]-1; W->PW_ID[b][a]=W->PW_ID[a][b]=array[9]; W->PW_SD[b][a]=W->PW_SD[a][b]=array[14]; } } vfclose ( fp); sprintf ( W->comments, "SD WEIGHTS GENERATED WITH THE PROGRAM AMPS IN PAIRWISE MODE"); vfree ( buf); return W; } Weights *read_seq_weight (char **name, int nseq, char* seq_weight) { int a, p; Weights *W; float w; FILE *fp; char line[LONG_STRING]; char sname[MAXNAMES]; /*Read sequence weights: * comment name1 weight1 ..... NOTE: weights must be between 0 and 1; sequences not in S do not get any weight sequences in S but not in file get a weight of 1 */ if ( !is_single_seq_weight_file (seq_weight)) { fprintf ( stderr, "\nERROR: File %s is not in Format SINGLE_SEQ_WEIGHT_FORMAT_01 [FATA:%s]", seq_weight,PROGRAM); myexit (EXIT_FAILURE); return NULL; } else { W=declare_weights(nseq); for ( a=0; a< nseq; a++) { sprintf ( W->seq_name[a], "%s", name[a]); W->SEQ_W[a]=1; } sprintf ( W->mode, "%s", seq_weight); fp=vfopen (seq_weight, "r"); while ( fgets( line,LONG_STRING-1, fp)) { if ( line[0]=='*' ||line[0]=='#' || isblanc(line)); else { if (sscanf(line, "%s %f", sname, &w)!=2)continue; if ( (p=name_is_in_list ( sname, W->seq_name, nseq, MAXNAMES-1))!=-1) { W->SEQ_W[p]=w; } } } vfclose (fp); return W; } } /*******************************************************************************************/ /* */ /* */ /* INPUT MISC */ /* */ /***************************************************************************************** */ char *** read_rename_file ( char *fname, int code) { int n; FILE *fp; char ***convert=NULL; convert=declare_arrayN(3, sizeof (char),count_n_line_in_file(fname) +1,2,MAXNAMES+1); fp=vfopen (fname, "r"); n=0; if ( code==CODE) while ( fscanf ( fp, "%s %s\n", convert[n][0], convert[n][1])==2)n++; else if (code==DECODE)while ( fscanf ( fp, "%s %s\n", convert[n][1], convert[n][0])==2)n++; vfclose (fp); return convert; } void get_barton_list_tc_seq ( char *in_file) { FILE *fp, *fp_make, *fp_length, *fp_long; FILE *fp_small[9]; static char *buf; int len_buf=10000; char name[100]; char pwd[100]; int a,c,nseq; int k=0; int *length; int longest=0; c=0; length=vcalloc ( 1000, sizeof(int)); if ( buf==NULL)buf=vcalloc ( len_buf, sizeof (char)); fp=vfopen (in_file, "r"); fp_long=vfopen ( "barton_seq_list_large", "w"); fp_make=vfopen ( "make_dir", "w"); fp_length=vfopen ( "barton_length", "w"); for ( a=0; a< 9; a++) { sprintf ( name, "barton_nseq%d",a); fp_small[a]=vfopen ( name, "w"); } get_pwd (pwd); while ( c!=EOF) {a=0; while ( (c=fgetc(fp))!='#'); while ( (c=fgetc(fp))=='#'); ungetc ( c, fp); while ( (c=fgetc(fp))!='#')buf[a++]=c; buf[a]='\0'; sprintf ( name, "%s", buf); while ( (c=fgetc(fp))=='#'); if ( c!=EOF) { a=0; while ( (c=fgetc(fp))!='#' && c!=EOF) { buf[a++]=c; if (a==len_buf) { len_buf+=10000; buf=vrealloc ( buf, len_buf*sizeof (char)); } } buf[a]='\0'; if (c!=EOF) { nseq=process_barton_entry ( buf,name); length[nseq]++; longest=(longest') { a=get_string_line (a,2, buf, buf2); while ((c=buf[a++])!='*') if (isalnum (c)|| c=='.' || c=='-') clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; nseq++; clen=0; } if ( buf[a]!='\0')a++; } LS=declare_sequence ( min_len_seq, max_len_seq, nseq); LS->nseq=nseq; for (a=0, current=0; current< nseq; current++) { a=get_string_line ( a, 1, buf, buf2); sscanf ( buf2, ">P1;%s", LS->name[current]); a=get_string_line ( a, 1, buf, buf2); l=strlen ( buf2); buf2[l-1]='\0'; sprintf ( LS->seq_comment[current], buf2); p=0; while ( (c=buf[a++])!='*') { if (isalpha (c)) LS->seq[current][p++]=tolower (c); else if ( isgraph(c)) LS->seq[current][p++]=(c); } a++; } LA=declare_Alignment(LS); seq2aln ( LS, LA,rm_gap); output_fasta_seq (fname,LA); output_pir_check (com_name,LA->nseq, LA->seq_comment); free_Alignment ( LA); free_sequence ( LS, nseq); return nseq; } Structure *read_rna_struc_number (Alignment *A,char *fname) { FILE *fp; int a; char x,y; float f; Sequence *SA; Structure *ST; int first, last; SA=declare_sequence ( A->len_aln, A->len_aln, 1); SA->len[0]=A->len[0]; for ( a=0; a< SA->len[0]; a++) SA->seq[0][a]='.'; ST=declare_rna_structure_num (SA); ST->S=SA; fp=vfopen ( fname, "r"); fscanf ( fp, "%c\n%d\n",&x, &(ST)->tot_list); for ( a=0; a<(ST)->tot_list; a++) { fscanf ( fp, "%d %d %d %c %c %f\n", &(ST)->list[a][0],&(ST)->list[a][1],&(ST)->list[a][2], &x, &y, &f); (ST)->list[a][0]--; (ST)->list[a][1]--; (ST)->list[a][2]--; if ( a==0) { (ST)->stem[0][0]=(ST)->list[a][0]; (ST)->stem[0][1]=a; } else if ( (ST)->stem[(ST)->tot_stem][0]==(ST)->list[a][0]); else if ( (ST)->stem[(ST)->tot_stem][0]!=(ST)->list[a][0]) { (ST)->stem[(ST)->tot_stem][2]=a-1; (ST)->tot_stem++; (ST)->stem[(ST)->tot_stem][0]=(ST)->list[a][0]; (ST)->stem[(ST)->tot_stem][1]=a; } SA->seq[0][(ST)->list[a][1]]='-'; SA->seq[0][(ST)->list[a][2]]='-'; } (ST)->stem[(ST)->tot_stem][2]=a-1; (ST)->tot_stem++; for ( a=0; a< (ST)->tot_stem; a++) { first=(ST)->stem[a][1]; last=(ST)->stem[a][2]; SA->seq[0][(ST)->list[first][1]]='>'; SA->seq[0][(ST)->list[first][2]]='<'; SA->seq[0][(ST)->list[last][1]]='>'; SA->seq[0][(ST)->list[last][2]]='<'; } return ST; } Structure * declare_rna_structure_num (Sequence *SA) { Structure *ST; ST=vcalloc ( 1, sizeof ( Structure)); ST->list=declare_int ( SA->len[0], 3); ST->stem=declare_int ( SA->len[0], 3); return ST; } char ** read_lib_list (char *name, int *n) { char **lines; char **list; int a, b, l; lines=file2lines (name); l=atoi (lines[0]); list=vcalloc (l, sizeof (char*)); for ( n[0]=0,a=1; a .... Groups must NOT be overlaping list[group_index][0]="number of sequences" list[group_index][1]="group name" list[group_index][2...N]="sequence" */ FILE *fp; char *buf; char ***list; int a, c, l; l=measure_longest_line_in_file (file)+1; buf=vcalloc (l, sizeof (char)); list=vcalloc ( count_n_line_in_file (file )+1, sizeof (char**)); fp=vfopen (file, "r"); a=0; while ((c=fgetc(fp))!=EOF) { buf=fgets (buf,l-1, fp); if ( c=='>')list[a++]=string2list (buf); } vfclose (fp); vfree (buf); return list; } static Sequence* get_pdb_sequence_from_field (char *fname, char *field); Sequence* get_pdb_sequence (char *fname) { Sequence *S; if ( (S=get_pdb_sequence_from_field(fname, "SEQRES"))!=NULL); else if ( (S=get_pdb_sequence_from_field(fname, "ATOM"))!=NULL) { add_warning (stderr,"Warning: Read Sequence from ATOM field in %s [%s:WARNING]", fname, PROGRAM); } else { add_warning ( stderr, "\nWARNING: failed to extract sequence from %s [%s:WARNING]\n", fname, PROGRAM); S=NULL; } return S; } static Sequence* get_pdb_sequence_from_field (char *fname, char *field) { char *tp_name; char *command; char *pdbid; Sequence *S; command=vcalloc ( LONG_STRING, sizeof (char)); tp_name=vtmpnam (NULL); sprintf ( command, "extract_from_pdb -seq_field %s -chain FIRST -infile \'%s\' -mode fasta > %s", field, check_file_exists(fname), tp_name); if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_seq] %s\n", command); my_system ( command); S=get_fasta_sequence ( tp_name, NULL); if (S==NULL)return NULL; if ( (pdbid=get_pdb_id (fname))){sprintf ( S->name[0], "%s",pdbid);vfree (pdbid);} S->nseq=1; sprintf ( S->file[0], "%s", fname); S->max_len=S->min_len=S->len[0]; if ( S->len[0]==0) { free_sequence (S, -1); S=NULL; } vremove ( tp_name); vfree ( command); return S; } char * get_pdb_file ( char *fname) { char *file; int a, c; FILE *fp; a=0; file=vcalloc ( sizeof (char),count_n_char_in_file ( fname)+1); fp=vfopen ( fname, "r"); while ( (c=fgetc(fp))!=EOF)file[a++]=c; file[a]='\0'; return file; } Sequence* get_struc_gor ( char *fname) { int nseq, min_len, max_len; int a, c; int len; char name[STRING]; FILE *fp; Sequence *S; min_len=max_len=-1; fp=vfopen ( fname, "r"); nseq=0; while ( (c=fgetc(fp))!=EOF) { if ( c!='!'); else { nseq++; fscanf ( fp, "%s %d", name, &len); if (min_len==-1)min_len=max_len=len; else { min_len=(len>min_len)?min_len:len; max_len=(len>max_len)?len:max_len; } } } vfclose (fp); S=declare_sequence ( min_len, max_len+1,nseq); S->nseq=0; fp=vfopen (fname,"r"); while ( (c=fgetc(fp))!=EOF) { if ( c!='!'); else { fscanf ( fp, "%s %d\n",S->name[S->nseq], &(S->len[S->nseq])); while ( (c=fgetc(fp))!='\n'); for ( a=0; alen[S->nseq]; a++) fscanf ( fp, " %*c %c %*f %*f %*f\n",&(S->seq[S->nseq][a])); S->seq[S->nseq][a]='\0'; while ( (c=fgetc(fp))!='!' && c!=EOF); ungetc (c, fp); S->nseq++; } } vfclose (fp); return S; } Sequence* get_sequence_dali (char *fname) { Sequence *LS; FILE *fp; int c; char name[100]; int clen=0; int current=0; int p=0; int max_len_seq=0; int min_len_seq=999999; int nseq=0; if ((fp=vfopen (fname,"r"))==NULL) {printf ( "\nCOULDN'T OPEN %s",fname); myexit(EXIT_FAILURE); } c=fgetc(fp); while (c!=EOF) { if (isdigit(c)) { ungetc(c, fp); fscanf (fp, "%s",name); while (!isdigit(c=fgetc(fp)) && c!=EOF) if (isalnum (c) || c=='.' || c=='-') clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; nseq++; clen=0; } else c=fgetc (fp); } vfclose (fp); LS=declare_sequence ( min_len_seq, max_len_seq+1,nseq); LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp); while (c!=EOF) { if (isdigit(c)) { ungetc(c, fp); fscanf_seq_name (fp, LS->name[current]); p=0; while (!isdigit(c=fgetc(fp)) && c!=EOF) { if (isalpha (c)) LS->seq[current][p++]=tolower (c); else if ( c=='.') LS->seq[current][p++]='-'; else if ( c=='-') LS->seq[current][p++]='-'; } LS->seq[current][p]='\0'; LS->len[current]=strlen ( LS->seq[current]); current++; } else c=fgetc ( fp); } vfclose (fp); return LS; } Sequence* get_dialign_sequence (char *fname) { Sequence *LS; FILE *fp; int c; char name[10000]; int clen=0; int current=0; int p=0; int max_len_seq=0; int min_len_seq=999999; int nseq=0, l=0; char *buf; buf=vcalloc ( 1000, sizeof (char)); if ((fp=vfopen (fname,"r"))==NULL) {printf ( "\nCOULDN'T OPEN %s",fname); myexit(EXIT_FAILURE); } c=fgetc(fp); while (c!=EOF) { if (c=='>') {fscanf (fp, "%s",name); buf=fgets ( buf, 1000, fp); while ((c=fgetc(fp))!='>' && c!=EOF && c!=' ' && c!='\t') if (isalnum (c)|| is_gap(c)) clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; nseq++; clen=0; } else c=fgetc (fp); } vfclose (fp); LS=declare_sequence ( min_len_seq, max_len_seq, nseq); LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp, LS->name[current]); l=strlen ( LS->name[current]); if ( LS->name[current][l-1]==','||LS->name[current][l-1]==',')LS->name[current][l-1]='\0'; buf=fgets ( buf, 1000, fp); p=0; while ((c=fgetc(fp))!='>' && c!=EOF && c!=EOF && c!=' ' && c!='\t') if (isalpha (c)) LS->seq[current][p++]=tolower (c); else if ( isgraph(c)) LS->seq[current][p++]=(c); LS->seq[current][p]='\0'; LS->len[current]=strlen ( LS->seq[current]); current++; } else c=fgetc ( fp); } vfclose (fp); return LS; } Sequence* get_pima_sequence (char *fname) { Sequence *LS; FILE *fp; int c; char name[10000]; int clen=0; int current=0; int p=0; int max_len_seq=0; int min_len_seq=999999; int nseq=0, l=0, len=0; char *buf, *buf2; char prefix[1000]; sprintf ( prefix, "%s",fname); buf=strstr(prefix, "-"); buf[0]='\0'; len=strlen (prefix); buf=vcalloc ( 1000, sizeof (char)); if ((fp=vfopen (fname,"r"))==NULL) {printf ( "\nCOULDN'T OPEN %s",fname); myexit(EXIT_FAILURE); } c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,name); if ( strlen(name)>=len && strncmp ( name, prefix, len)==0) { c=fgetc(fp); } else { buf=fgets ( buf, 1000, fp); while ((c=fgetc(fp))!='>' && c!=EOF) if (isalnum (c)|| is_gap(c)) clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; nseq++; clen=0; } } else c=fgetc (fp); } vfclose (fp); LS=declare_sequence ( min_len_seq, max_len_seq, nseq); LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,LS->name[current]); if ( strlen(LS->name[current])>=len && strncmp ( LS->name[current], prefix, len)==0) c=fgetc (fp); else { buf2=strstr (LS->name[current], "."); if ( buf2!=NULL) buf2[0]='\0'; l=strlen ( LS->name[current]); if ( LS->name[current][l-1]==','||LS->name[current][l-1]==',')LS->name[current][l-1]='\0'; buf=fgets ( buf, 1000, fp); p=0; while ((c=fgetc(fp))!='>' && c!=EOF) if (isalpha (c)) LS->seq[current][p++]=tolower (c); else if ( isgraph(c)) LS->seq[current][p++]=(c); LS->seq[current][p]='\0'; LS->len[current]=strlen ( LS->seq[current]); current++; } } else c=fgetc ( fp); } vfclose (fp); return LS; } Sequence* perl_reformat2fasta (char *perl_command, char *fname) { char command[1000]; char *file; file=vtmpnam (NULL); check_program_is_installed ( perl_command,"", perl_command,EMAIL,IS_FATAL); sprintf ( command, "%s %s > %s", perl_command, fname, file); my_system ( command); return get_fasta_sequence (file, NULL); } Sequence* get_fasta_sequence_num (char *fname, char *comment_out) { Sequence *LS; char *buffer; FILE *fp; int a; int c; char *name; int clen=0; int current=0; int p=0; int max; int max_len_seq=0; int min_len_seq=0; int nseq=0, l=0; int *sub; buffer=vcalloc (1000, sizeof (char)); name=vcalloc ( 100, sizeof (char)); nseq=count_n_char_x_in_file(fname, '>'); min_len_seq=max=count_n_char_in_file(fname); sub=vcalloc (max+1, sizeof (int)); fp=vfopen (fname,"r"); c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,name); while ((c=fgetc(fp))!='\n' && c!=EOF); while ((c=fgetc(fp))!='>' && c!=EOF) if (isalnum (c)|| is_gap(c)) clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; clen=0; } else c=fgetc (fp); } vfclose (fp); LS=declare_sequence ( min_len_seq, max_len_seq,nseq); LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,LS->name[current]); l=strlen ( LS->name[current]); if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0'; LS->name[current]=translate_name ( LS->name[current]); a=0; while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1))LS->seq_comment[current][a++]=c; LS->seq_comment[current][a]='\0'; p=0; while ((c=fgetc(fp))!='>' && c!=EOF) { if (isalnum (c)) LS->seq[current][p++]=c; else if (is_gap(c)) LS->seq[current][p++]=c; } LS->seq[current][p]='\0'; LS->len[current]=strlen ( LS->seq[current]); current++; } else c=fgetc ( fp); } vfclose (fp); vfree (sub); vfree (name); vfree (buffer); return LS; } Sequence *get_tree_file_list ( char *fname) { char ***list; char *tmp; int a; FILE *fp; tmp=vtmpnam (NULL); list=file2list (fname, "\n"); fp=vfopen (tmp, "w"); a=0; while (list[a] && !isspace(list[a][1][0])) { char *s; s=file2string (list[a][1]); fprintf ( fp, ">%s\n%s\n", list[a][1], (s)?s:""); a++; } vfclose (fp); free_arrayN((void ***)list, 3); return get_fasta_tree (tmp, NULL); } Sequence *get_file_list ( char *fname) { char ***list; char *tmp; int a; FILE *fp; tmp=vtmpnam (NULL); list=file2list (fname, "\n"); fp=vfopen (tmp, "w"); a=0; while (list[a] && !isspace(list[a][1][0])) { fprintf ( fp, ">%s\n", list[a][1]); a++; } vfclose (fp); free_arrayN((void ***)list, 3); return get_fasta_sequence (tmp, NULL); } Sequence*get_fasta_tree (char *fname, char *comment_out) { Sequence *LS; char *buffer; FILE *fp; int a; int c; char *name; int clen=0; int current=0; int p=0; int max; int max_len_seq=0; int min_len_seq=0; int nseq=0, l=0; int *sub; buffer=vcalloc (1000, sizeof (char)); name=vcalloc ( 100, sizeof (char)); nseq=count_n_char_x_in_file(fname, '>'); min_len_seq=max=count_n_char_in_file(fname); sub=vcalloc (max+1, sizeof (int)); fp=vfopen (fname,"r"); c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,name); while ((c=fgetc(fp))!='\n' && c!=EOF); while ((c=fgetc(fp))!='>' && c!=EOF) if (isgraph(c)) clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; clen=0; } else c=fgetc (fp); } vfclose (fp); LS=declare_sequence ( min_len_seq, max_len_seq,nseq); LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,LS->name[current]); l=strlen ( LS->name[current]); if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0'; LS->name[current]=translate_name ( LS->name[current]); a=0; while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1))LS->seq_comment[current][a++]=c; LS->seq_comment[current][a]='\0'; p=0; while ((c=fgetc(fp))!='>' && c!=EOF) { LS->seq[current][p++]=c; } LS->seq[current][p]='\0'; LS->len[current]=strlen ( LS->seq[current]); current++; } else c=fgetc ( fp); } vfclose (fp); vfree (sub); vfree (name); vfree (buffer); return LS; } Sequence* get_fasta_sequence_raw (char *fname, char *comment_out) { Sequence *LS; char *buffer; FILE *fp; int a; int c; char *name; int clen=0; int current=0; int p=0; int max; int max_len_seq=0; int min_len_seq=0; int nseq=0, l=0; int *sub; buffer=vcalloc (1000, sizeof (char)); name=vcalloc ( 100, sizeof (char)); nseq=count_n_char_x_in_file(fname, '>'); min_len_seq=max=count_n_char_in_file(fname); sub=vcalloc (max+1, sizeof (int)); fp=vfopen (fname,"r"); c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,name); while ((c=fgetc(fp))!='\n' && c!=EOF); while ((c=fgetc(fp))!='>' && c!=EOF) if (isgraph(c)) clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; clen=0; } else c=fgetc (fp); } vfclose (fp); LS=declare_sequence ( min_len_seq, max_len_seq,nseq); LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,LS->name[current]); l=strlen ( LS->name[current]); if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0'; LS->name[current]=translate_name ( LS->name[current]); a=0; while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1))LS->seq_comment[current][a++]=c; LS->seq_comment[current][a]='\0'; p=0; while ((c=fgetc(fp))!='>' && c!=EOF) { //if (c<'A')c+='z'; if (c!='\n')LS->seq[current][p++]=c; } LS->seq[current][p]='\0'; LS->len[current]=strlen ( LS->seq[current]); current++; } else c=fgetc ( fp); } vfclose (fp); vfree (sub); vfree (name); vfree (buffer); return LS; } Sequence* get_fasta_sequence (char *fname, char *comment_out) { Sequence *LS; Sequence *pdb_S; int a; char *pdb_name; char *buffer; FILE *fp; int c; char *name; int clen=0; int current=0; int p=0; int max; int max_len_seq=0; int min_len_seq=0; int nseq=0, l=0; char *sub; int disk=0; int coor=0; char *test; buffer=vcalloc (1000, sizeof (char)); name=vcalloc ( 10000, sizeof (char)); nseq=count_n_char_x_in_file(fname, '>'); if (disk==1 || get_int_variable ("use_disk") || getenv ("SEQ_ON_DISK_4_TCOFFEE")){disk=1;} if ( nseq==0) { vfree (buffer); vfree (name); return NULL; } min_len_seq=max=count_n_char_in_file(fname); sub=vcalloc (max+1, sizeof (char)); fp=vfopen (fname,"r"); nseq=0; c=fgetc(fp); while (c!=EOF) { if (c=='>') { nseq++; fscanf_seq_name (fp,name); while ((c=fgetc(fp))!='\n' && c!=EOF); while ((c=fgetc(fp))!='>' && c!=EOF) { if (isalnum (c)|| is_gap(c)) sub[clen++]=c; } if (strm (sub, "PDB")) { pdb_name=get_pdb_struc(name,0, 0); pdb_S=get_pdb_sequence (pdb_name); if (pdb_S) { clen=strlen( pdb_S->seq[0]); free_sequence ( pdb_S,1); } else clen=0; } max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; clen=0; } else c=fgetc (fp); } vfclose (fp); if ( disk==0) LS=declare_sequence ( min_len_seq, max_len_seq,nseq); else { LS=declare_sequence (0,0,nseq); for (a=0; aseq[a]=NULL; } LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp);coor++; while (c!=EOF) { if (c=='>') { coor+=fscanf_seq_name (fp, LS->name[current]); l=strlen ( LS->name[current]); if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0'; LS->name[current]=translate_name ( LS->name[current]); a=0; while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1)){LS->seq_comment[current][a++]=c;coor++;} coor++; LS->seq_comment[current][a]='\0'; p=0; while ((c=fgetc(fp))!='>' && c!=EOF) { coor++; if (!isspace(c)) { if (p==0)LS->dc[current][0]=coor; if (disk==0)LS->seq[current][p++]=c; else p++; } LS->dc[current][1]=coor; } coor++; if ( disk==0)LS->seq[current][p]='\0'; if (LS->seq[current] && strm (LS->seq[current], "PDB")) { pdb_name=get_pdb_struc(LS->name[current],0, 0); pdb_S=get_pdb_sequence (pdb_name); if (pdb_S) { sprintf ( LS->seq[current], "%s", pdb_S->seq[0]); clen=strlen( pdb_S->seq[0]); free_sequence ( pdb_S, 1); } else { add_warning (stderr, "WARNING: Could not fetch PDB file: %s", pdb_name); } } LS->len[current]=p; current++; } else { c=fgetc ( fp); coor++; } } vfclose (fp); vfree (sub); vfree (name); vfree (buffer); //LS=clean_sequence (LS); return LS; } Sequence* get_sub_fasta_sequence (char *fname, char *comment_out) { Sequence *LS; FILE *fp; int c; char name[100]; int clen=0; int current=0; int p=0; int max; int max_len_seq=0; int min_len_seq=0; int nseq=0, l=0; char *buf; int *sub; nseq=count_n_char_x_in_file(fname, '>'); min_len_seq=max=count_n_char_in_file(fname); sub=vcalloc (max+1, sizeof (int)); buf=vcalloc ( max+1, sizeof (char)); fp=vfopen (fname,"r"); c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,name); while ((c=fgetc(fp))!='\n' && c!=EOF); buf=fgets ( buf,max, fp); while ((c=fgetc(fp))!='>' && c!=EOF) if (isalnum (c)|| is_gap(c)) clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; clen=0; } else c=fgetc (fp); } vfclose (fp); LS=declare_sequence ( min_len_seq, max_len_seq,nseq); LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp); while (c!=EOF) { if (c=='>') { fscanf_seq_name (fp,LS->name[current]); l=strlen ( LS->name[current]); if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0'; LS->name[current]=translate_name ( LS->name[current]); while ((c=fgetc(fp))!='\n' && c!=EOF); p=0; while ((c=fgetc(fp))!='>' && c!=EOF) { if (isalpha (c)) LS->seq[current][p++]=tolower (c); else if (is_gap(c)) LS->seq[current][p++]=(c); } LS->seq[current][p]='\0'; LS->len[current]=strlen ( LS->seq[current]); current++; } else c=fgetc ( fp); } vfclose (fp); vfree (sub); return LS; } Sequence* get_pir_sequence (char *fname, char *comment_out) { Sequence *LS; FILE *fp; int c; char name[100]; int clen=0; int current=0; int p=0; int max_len_seq=0; int min_len_seq=999999; int nseq=0, l=0; char *buf; buf=vcalloc ( 1000, sizeof (char)); if ((fp=vfopen (fname,"r"))==NULL) {printf ( "\nCOULDN'T OPEN %s",fname); myexit(EXIT_FAILURE); } c=fgetc(fp); while (c!=EOF) { if (c=='>') { if ( (c=fgetc(fp))=='P')while ( (c=fgetc(fp))!=';'); else ungetc ( c, fp); fscanf_seq_name (fp,name); buf=fgets ( buf, 1000, fp); while ((c=fgetc(fp))!='>' && c!=EOF) if (isalnum (c)|| is_gap(c)) clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; nseq++; clen=0; } else c=fgetc (fp); } vfclose (fp); LS=declare_sequence ( min_len_seq, max_len_seq,nseq); LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp); while (c!=EOF) { if (c=='>') { if ( (c=fgetc(fp))=='P')while ( (c=fgetc(fp))!=';'); else ungetc ( c, fp); fscanf_seq_name (fp,LS->name[current]); l=strlen ( LS->name[current]); if ( LS->name[current][l-1]==','||LS->name[current][l-1]==',')LS->name[current][l-1]='\0'; LS->name[current]=translate_name ( LS->name[current]); buf=fgets ( buf, 1000, fp); LS->seq_comment[current]=fgets ( LS->seq_comment[current],COMMENT_SIZE-1, fp); LS->seq_comment[current][strlen(LS->seq_comment[current])-1]='\0'; p=0; while ((c=fgetc(fp))!='>' && c!=EOF) if (isalpha (c)) LS->seq[current][p++]=tolower (c); else if ( !isspace(c) && c!='*') LS->seq[current][p++]=(c); LS->seq[current][p]='\0'; LS->len[current]=strlen ( LS->seq[current]); current++; } else c=fgetc ( fp); } vfclose (fp); if (comment_out!=NULL) output_pir_check ( comment_out,LS->nseq, LS->seq_comment); return LS; } Sequence* get_gor_sequence (char *fname, char *comment_out) { Sequence *LS; FILE *fp; int c; char name[100]; int clen=0; int current=0; int p=0; int max_len_seq=0; int min_len_seq=99999; int nseq=0; char *buf; buf=vcalloc ( 1000, sizeof (char)); if ((fp=vfopen (fname,"r"))==NULL) {printf ( "\nCOULDN'T OPEN %s",fname); myexit(EXIT_FAILURE); } c=fgetc(fp); while (c!=EOF) { if (c=='!') { fscanf_seq_name (fp,name); buf=fgets ( buf, 1000, fp); while ((c=fgetc(fp))!='!' && c!=EOF) if (isalnum (c)|| is_gap(c)) clen++; max_len_seq=(clen> max_len_seq)?clen: max_len_seq; min_len_seq=(clen< min_len_seq)?clen: min_len_seq; nseq++; clen=0; } else c=fgetc (fp); } vfclose (fp); LS=declare_sequence ( min_len_seq, max_len_seq,nseq); LS->nseq=nseq; fp=vfopen (fname,"r"); current=0; c=fgetc(fp); while (c!=EOF) { if (c=='!') { fscanf_seq_name (fp,LS->name[current]); LS->name[current]=translate_name ( LS->name[current]); buf=fgets ( buf, 1000, fp); p=0; while ((c=fgetc(fp))!='!' && c!=EOF) if (isalnum (c)|| is_gap(c)) LS->seq[current][p++]=tolower (c); LS->seq[current][p]='\0'; LS->len[current]=strlen ( LS->seq[current]); current++; } else c=fgetc ( fp); } vfclose (fp); return LS; } Sequence* get_swissprot_sequence (char *fname, char *comment_out) { Sequence *LS; FILE *fp; int c; char *buf; int nseq=0; int len, max_len_seq=0, min_len_seq=0; if ( !check_file_exists(fname)) {printf ( "\nCOULDN'T OPEN %s",fname); myexit(EXIT_FAILURE); } buf=vcalloc (LONG_STRING+1, sizeof (char)); fp=NULL; while ( (fp=find_token_in_file(fname,fp,"\nSQ"))) { nseq++; fgets (buf, LONG_STRING, fp); len=0; while ((c=fgetc(fp))!='/')if(isalpha(c))len++; if ( max_len_seq==0)max_len_seq=min_len_seq=len; else { max_len_seq=MAX(len, max_len_seq); min_len_seq=MIN(len, min_len_seq); } } LS=declare_sequence ( min_len_seq, max_len_seq,nseq); LS->nseq=0; fp=NULL; while ( (fp=find_token_in_file(fname,fp,"\nID"))) { fscanf_seq_name (fp, LS->name[LS->nseq]); fp=find_token_in_file(fname,fp,"\nSQ"); fgets (buf, LONG_STRING, fp); while ((c=fgetc(fp))!='/')if (isalpha(c))LS->seq[LS->nseq][LS->len[LS->nseq]++]=c; LS->seq[LS->nseq][LS->len[LS->nseq]]='\0'; LS->nseq++; } return LS; } int fscanf_seq_name ( FILE *fp, char *sname) { static char *name; int r; if ( !sname) return 0; if ( !name)name=vcalloc ( 10000, sizeof (char)); fscanf (fp, "%s", name); r=strlen (name); if ( strlen (name)>MAXNAMES) add_warning (stderr, "\nWARNING: Seq Name Too long: [%s]. Truncated to %d", name, MAXNAMES); name[MAXNAMES]='\0'; sprintf ( sname, "%s", name); return r; } /*******************************************************************************************/ /* */ /* */ /* INPUT ALN */ /* */ /***************************************************************************************** */ void undump_msa ( Alignment *A, char *tmp) { FILE *fp; int m; char *buf; int index; if ( !A || !tmp || !check_file_exists (tmp))return; m=measure_longest_line_in_file (tmp ); A=realloc_aln2 ( A,A->max_n_seq,m+1); buf=vcalloc (m+1, sizeof (char)); fp=vfopen (tmp, "r"); while (fscanf (fp, "%d %s\n", &index, buf)==2) { sprintf ( A->seq_al[index], "%s", buf); } vfclose (fp); vfree (buf); } void dump_msa ( char *file,Alignment *A, int nseq, int *lseq) { FILE *fp; int a; fp=vfopen (file, "w"); for (a=0; aseq_al[lseq[a]]); vfclose (fp); } void read_aln (char *file_name, Alignment *A) { char *tmp_name; Sequence *S; tmp_name=vtmpnam (NULL); if (printf_system ( "clustalw_aln2fasta_aln.pl %s > %s",file_name, tmp_name)!=EXIT_SUCCESS) { printf_exit ( EXIT_FAILURE, stderr, "Could Not Read File %s [FATAL:%s]\n", file_name, PROGRAM); } else { S=get_fasta_sequence ( tmp_name,NULL); A=seq2aln (S, A, 0); } return; } void read_stockholm_aln (char *file_name, Alignment *A) { char *tmp_name; Sequence *S; tmp_name=vtmpnam (NULL); if (printf_system ( "clustalw_aln2fasta_aln.pl %s > %s",file_name, tmp_name)!=EXIT_SUCCESS) { printf_exit ( EXIT_FAILURE, stderr, "Could Not Read File %s [FATAL:%s]\n", file_name, PROGRAM); } else { int a; S=get_fasta_sequence ( tmp_name,NULL); for (a=0; anseq; a++) { if (strstr (S->name[a], "_stockholm")) { substitute ( S->name[a], "_stockholmspace_", " "); substitute ( S->name[a], "_stockholmhasch_", "#"); } } A=seq2aln (S, A, 0); } return; } Alignment* read_blast_aln ( char *file_name, Alignment *A) { char *tmp_name; Sequence *S; int type; int a; if ( !(type=is_blast_file (file_name))) { myexit (EXIT_FAILURE); } tmp_name=vtmpnam ( NULL); if (type==BLAST_TXT) { printf_system("cat %s | blast_aln2fasta_aln.pl | fasta_aln2fasta_aln_unique_name.pl >%s", file_name, tmp_name); } else if (type==BLAST_XML) { printf_system("blast_xml2fasta_aln.pl %s >%s", file_name, tmp_name); } main_read_aln (tmp_name, A); return A; } void read_number_aln ( char *file_name, Alignment *A) { FILE *fp, *fp2; int * ptr_aln; int a,b,d; int c; char *buf=NULL; int tot=0; int flag=0; char *fname; int n_comment=0; int nseq=0; int max_len=0; fp=vfopen ( file_name, "r"); fname=vtmpnam(NULL); fp2=vfopen ( fname, "w"); while ( (c=fgetc(fp))!=EOF) { fprintf ( fp2, "%c", c); } vfclose (fp); vfclose (fp2); /*1 Count The number of sequences*/ fp=vfopen ( fname, "r"); buf=vfgets ( buf,fp); if ( !isblanc (buf)); while ( isblanc (buf)) { buf=vfgets ( buf, fp); } while (!isblanc (buf)) { buf=vfgets ( buf,fp); } while ( !isalnum ((c=fgetc(fp)))) { ungetc(c,fp); buf=vfgets ( buf,fp); } if ( c!='\n')ungetc(c,fp); while ( isalnum ((c=fgetc(fp)))) { ungetc(c,fp); a=0; while ( isgraph ((c=fgetc(fp)))); nseq++; buf=vfgets ( buf, fp); } vfclose (fp); /*DONE*/ /*2 get_max_len*/ max_len=count_n_char_in_file(fname)/nseq; A=realloc_alignment2( A, nseq+1, max_len+1); /*DONE*/ fp=vfopen ( fname, "r"); buf=vfgets ( buf, fp); if ( !isblanc (buf))sprintf (A->aln_comment[n_comment++], "%s", buf); while ( isblanc (buf)) { buf=vfgets ( buf,fp); } while (!isblanc (buf)) { buf=vfgets ( buf, fp); sprintf ( A->aln_comment[n_comment++], "%s", buf); } while ( !isalnum ((c=fgetc(fp)))) { ungetc(c,fp); buf=vfgets ( buf, fp); } if ( c!='\n')ungetc(c,fp); while ( isalnum ((c=fgetc(fp)))) { ungetc(c,fp); fscanf_seq_name (fp, A->name[A->nseq]); if ( name_is_in_list (A->name[A->nseq], A->name, A->nseq, 100)!=-1) { fprintf ( stderr, "\nWARNING (read_number_aln): Sequence %s Duplicated in File %s ", A->name[A->nseq], A->file[A->nseq]); if (!getenv("ALLOW_DUPLICATE")) { fprintf ( stderr, " [FATAL:%s]\n", PROGRAM); myexit (EXIT_FAILURE); } } A->nseq++; buf=vfgets ( buf,fp); } vfclose (fp); if ((fp=vfopen ( fname, "r"))==NULL) printf ( "\nCOULDN'T READ %s", fname); ptr_aln=vcalloc ( A->nseq, sizeof(int)); while ( flag==0) { while ( (c=fgetc(fp))!='\n'); if ( (c=fgetc(fp))=='\n') flag=1; } while ( !isalnum(c=fgetc(fp))); ungetc ( c, fp); while ( c!=EOF) { tot=0; while(tot< A->nseq && c!=EOF) { b=0; while ( !isgraph (c=fgetc(fp)) && c!=EOF); if ( c!=EOF)ungetc(c, fp); while ( isgraph((buf[b++]=fgetc(fp)))); buf[b-1]='\0'; for ( a=-1,d=0; d< A->nseq; d++) if ( strcmp (A->name[d], buf)==0) {a=d; tot++; } if ( a==-1) while ( (c=fgetc(fp))!='\n' && c!=EOF); else { while ( (c=fgetc(fp))!='\n') { if ( isgraph(c) || is_gap(c)) {if ( isalpha(c)) c=(A->residue_case==2)?c:tolower(c); if (!isspace(c))A->seq_al[a][ptr_aln[a]++]=c; } } } } while ( !isalnum(c=getc(fp)) && c!=EOF); if ( c!=EOF) ungetc (c, fp); } vfclose (fp); for ( a=0; a< A->nseq; a++) {A->seq_al[a][ptr_aln[a]]='\0'; A->order[a][0]=a; A->order[a][1]=0; } A->len_aln= strlen(A->seq_al[0]); vfree (buf); vfree(ptr_aln); vremove (fname); } void read_amps_aln ( char *in_file, Alignment *A) { FILE *fp; int a, b, c, cont=1; A->nseq=get_amps_seq_name ( A->name, in_file); fp=vfopen ( in_file, "r"); fp=set_fp_id(fp, "1*"); while ( (c=fgetc(fp))!='\n'); b=0; while ( cont==1) { c=fgetc ( fp); c=fgetc(fp); if ( c=='*') { cont=0; for ( a=0; anseq; a++) A->seq_al[a][b]='\0'; A->len_aln=b; } else { ungetc (c, fp); for ( a=0; a< A->nseq; a++) { c=fgetc(fp); if ( c==' ')A->seq_al[a][b]='-'; else { A->seq_al[a][b]=c; A->len[a]++; } } while ((c=fgetc(fp))!='\n'); b++; } } } int get_amps_seq_name ( char **name, char* fname) { FILE *fp; int nseq=0; fp=vfopen ( fname, "r"); fp=set_fp_id ( fp, "Index"); while ( (fgetc(fp))!='\n'); while ( isspace(fgetc(fp))) {fscanf (fp, "%*d >%s", name[nseq++]); while ( (fgetc(fp))!='\n'); } vfclose ( fp); return nseq; } Alignment * read_gotoh_aln ( char *fname, Alignment *A) { FILE *fp; int * ptr_aln; int a,b,d,e; char *buf; char buf2[VERY_LONG_STRING+1]; char buf3[VERY_LONG_STRING+1]; char buf4[VERY_LONG_STRING+1]; int tot=0; int l; int nseq, max_len; if ( !check_file_exists (fname))return NULL; fp=vfopen ( fname, "r"); /*1 GET THE NUMBER OF SEQUENCES*/ nseq=0; buf=vcalloc ( VERY_LONG_STRING+1, sizeof (char)); while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp))); while (!isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp))); while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp))); while ( !isblanc ( buf) && buf!=NULL) { a=-1; d=sscanf ( buf, "%d %s %s %s", &a, buf2, A->name[A->nseq],buf3); if ( a!=-1) { if ( name_is_in_list (A->name[A->nseq], A->name, A->nseq, 100)!=-1) { fprintf ( stderr, "\nWARNING (get_amps_seq_name): Sequence %s Duplicated in File %s ", A->name[A->nseq], A->file[A->nseq]); if (!getenv("ALLOW_DUPLICATE")) { fprintf ( stderr, " [FATAL:%s]\n", PROGRAM); myexit (EXIT_FAILURE); } } nseq++; fgets(buf, VERY_LONG_STRING, fp); } else ( buf=NULL); } vfclose (fp); /*2 Get the MAX Len and Reallocate*/ max_len=count_n_char_in_file(fname)/nseq; A=realloc_aln2( A, nseq+1, max_len+1); /*3 Get The Sequences Names*/ A->nseq=0; fp=vfopen ( fname, "r"); while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp))); while (!isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp))); while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp))); while ( !isblanc ( buf) && buf!=NULL) { a=-1; d=sscanf ( buf, "%d %s %s %s", &a, buf2, A->name[A->nseq],buf3); if ( a!=-1) { if ( d==4)sprintf (A->name[A->nseq],"%s", buf3); A->nseq++; fgets(buf, VERY_LONG_STRING, fp); } else ( buf=NULL); } vfclose (fp); /*READ THE ALN*/ fp=vfopen ( fname, "r"); buf=vcalloc ( VERY_LONG_STRING+1, sizeof (char));; ptr_aln=vcalloc ( A->nseq, sizeof(int)); while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp))); while (!isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp))); while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp))); while (buf!=NULL) { tot=0; while(tot< A->nseq) { e=sscanf (buf, "%d %s %s %s", &e, buf2, buf3, buf4); if ( e==4)sprintf( buf3, "%s", buf4); for ( d=0; d< A->nseq; d++) { if ( strcmp (A->name[d], buf3)==0) {a=d; tot++; } } l=strlen (buf2); if ( buf2[l-1]=='|')l--; buf2[l]='\0'; for (b=0; bseq_al[a][ptr_aln[a]++]=(A->residue_case==2)?buf2[b]:tolower (buf2[b]); } buf=fgets(buf, VERY_LONG_STRING, fp); } if ( buf!=NULL) { buf=fgets(buf, VERY_LONG_STRING, fp); while ( isblanc (buf) && buf!=NULL) { buf=fgets ( buf, VERY_LONG_STRING, fp); } } } vfclose (fp); for ( a=0; a< A->nseq; a++) {A->seq_al[a][ptr_aln[a]]='\0'; } A->len_aln= strlen(A->seq_al[0]); for ( a=0; a< A->nseq; a++) { for ( b=0; b< A->len_aln; b++) A->len[a]+=1-is_gap(A->seq_al[a][b]); } for ( a=0, b=0; a< A->len_aln; a++) { if ( !is_gap(A->seq_al[0][a]) &&!is_gap(A->seq_al[1][a]))b++; } return A; } void read_msf_aln ( char *fname, Alignment *A) { char command[1000]; char *tmp_name; Sequence *S; tmp_name=vtmpnam(NULL); sprintf ( command, "msf_aln2fasta_aln.pl %s > %s", fname, tmp_name); if ( my_system (command)!=EXIT_SUCCESS) { fprintf ( stderr, "\nERROR: file %s does not have a legal msf format [FATAL:%s]", fname,PROGRAM); myexit (EXIT_FAILURE); } S=get_fasta_sequence ( tmp_name,NULL); A=seq2aln (S, A, 0); vremove (tmp_name); return; } /**************************************************************************************************/ /*************************************REFORMAT OUT*************************************************/ /**************************************************************************************************/ /*******************************************************************************************/ /* */ /* */ /* OUTPUT MATRICES */ /* */ /***************************************************************************************** */ int output_freq_mat ( char *outfile, Alignment *A) { /* function documentation: start int output_freq_mat ( char *outfile, Aligmnent *A) This function counts the number of residues in each column of an alignment (Prot) It outputs these values in the following format A | 0 0 0 1 0 B | 1 0 0 0 1 - | 0 1 1 0 0 This format can be piped into: The routine used for computing the p-value gmat-inf-gc-v2c function documentation: end */ int a, b; int **freq_mat; FILE *fp; freq_mat=aln2count_mat (A); fp=vfopen ( outfile, "w"); for ( b=0; b< 26; b++) { fprintf (fp, "%c |", 'A'+b); for ( a=0; a< A->len_aln; a++)fprintf (fp,"%d ", freq_mat[b][a]); fprintf (fp, "\n"); } fprintf (fp, "- |"); for ( a=0; a< A->len_aln; a++)fprintf (fp,"%d ", freq_mat[26][a]); free_int (freq_mat, -1); vfclose ( fp); return 1; } /*******************************************************************************************/ /* */ /* */ /* OUTPUT P-Values */ /* */ /***************************************************************************************** */ float output_maln_pval ( char *outfile, Alignment *A) { /* function documentation: start float output_maln_pval ( char *outfile, Aligmnent *A) This function outputs the p-value of a multiple alignmnet as described in Hertz, Stormo, Bioinformatics, 15-7/8, 563/577 ftp beagle.colorado.edu /pub/cosensus Locally packages/consensus/gmat-inf-gc-v2c The routine used for computing the p-value is the program gmat-inf-gc-v2c function documentation: end */ char *mat; char *result; FILE *fp; float value; char command[LONG_STRING]; char string[STRING]; mat=vtmpnam (NULL); result=vtmpnam (NULL); output_freq_mat (mat,A); sprintf ( command, "more %s | gmat-inf-gc-v2c -A abcdefghijklmnopqrstuvwxyz> %s",mat, result); my_system ( command); if ( !check_file_exists(result))return 0; fp=find_token_in_file ( result, NULL, "ln(p-value):"); fscanf ( fp, "%s",string); value=atof ( string); vfclose ( fp); vremove ( mat); vremove ( result); fp=vfopen ( outfile, "w"); fprintf ( fp, "%.6f\n", value); vfclose ( fp); return value; } /*******************************************************************************************/ /* */ /* */ /* OUTPUT WEIGHTS */ /* */ /***************************************************************************************** */ int output_seq_weights ( Weights *W, char *wfile) { FILE*fp; int a; if ( W==NULL)return 0; fp=vfopen (wfile, "w"); if ( fp==NULL)return 0; for ( a=0; a< W->nseq; a++) { fprintf ( fp, "%s %.2f\n", W->seq_name[a],W->SEQ_W[a]); } vfclose ( fp); return 1; } void output_pw_weights4saga ( Weights *W, float **w_list, char *wfile) { FILE*fp; int a, b; fp=vfopen (wfile, "w"); fprintf ( fp, "%s\n$\n", W->comments); for ( a=0; a< W->nseq-1; a++) { for (b=a+1; b< W->nseq; b++) { fprintf ( fp, "%s %s %f\n", W->seq_name[a], W->seq_name[b],w_list[a][b]); } } fprintf ( fp, "$\n"); vfclose ( fp); } FILE * display_weights (Weights *W, FILE *fp) { int a; int max_len; if ( W==NULL) { fprintf ( fp, "\n\nUN-WEIGHTED MODE: EVERY SEQUENCE WEIGHTS 1\n"); return fp; } fprintf ( fp, "\n\nWEIGHTED MODE:%s\n\n", (W)->mode); for ( a=0, max_len=0; a< W->nseq; a++)max_len=MAX(max_len, strlen (W->seq_name[a])); for ( a=0; a< (W->nseq); a++) { fprintf ( fp, "\t%*s %.2f\n", max_len,(W)->seq_name[a],W->SEQ_W[a]); } fprintf ( fp, "\n"); return fp; } /*******************************************************************************************/ /* */ /* */ /* OUTPUT SEQ */ /* */ /***************************************************************************************** */ int ** input_similarities (char *file, Alignment *A, char *mode) { int a, b, i, n; int **sim; float score; char name[1000]; FILE *fp=NULL; char *buf1=NULL, *buf2=NULL; int new_aln=0; if ( !check_file_exists (file) || !is_distance_matrix_file (file) ||!is_similarity_matrix_file (file) ) { return NULL; } if ( A) { fp=vfopen (file, "r"); while ((buf2=vfgets (buf1,fp))!=NULL ) { if (strstr (buf2, "SEQ_INDEX")) { buf1=buf2; sscanf (buf1, "# SEQ_INDEX %s %d",name, &i); if ( !strm (A->name[i], name)) { return NULL; } } } vfclose (fp); } else { A=similarities_file2aln(file); new_aln=1; } sim=declare_int ( A->nseq, A->nseq); for ( a=0; anseq; a++)sim[a][a]=100; fp=find_token_in_file (file, NULL, "PW_SEQ_DISTANCES"); fp=find_token_in_file (file, fp, "BOT"); while ((buf2=vfgets (buf1,fp))!=NULL ) { if ( !(strstr (buf2, "BOT\t") || strstr (buf2, "TOP\t")))continue; buf1=buf2; n=sscanf (buf1, "%*s %d %d %f", &a, &b, &score); if ( n!=3) { free_int (sim, -1); return NULL; } else sim[a][b]=sim[b][a]=(int)score; } vfclose (fp); vfree (buf1); if (new_aln)free_aln(A); return sim; } Alignment * similarities_file2aln ( char *file) { int nseq=0, i; FILE *fp; char name[1000]; Alignment *A; fp=vfopen (file, "r"); while ((fp=find_token_in_file (file,fp, "SEQ_INDEX")))nseq++; A=declare_aln2 (nseq+1, 10); while ((fp=find_token_in_file (file,fp, "SEQ_INDEX"))) { fscanf (fp, "%s %d", name,&i); sprintf ( A->name[i], "%s", name); } A->nseq=nseq; return A; } void output_similarities (char *file, Alignment *A, char *mode) { float s; float *tot; float bigtot=0; int n, max; FILE *fp; int a, b; char *p; int **M=NULL; for (max=0, a=0; a< A->nseq; a++)max=MAX(max,(strlen (A->name[a]))); tot=vcalloc ( A->nseq, sizeof (float)); fp=vfopen (file, "w"); fprintf (fp, "# TC_SIMILARITY_MATRIX_FORMAT_01\n"); for ( a=0; anseq; a++) fprintf ( fp, "# SEQ_INDEX %s %d\n",A->name[a],a); fprintf ( fp, "# PW_SEQ_DISTANCES \n"); for (n=0,a=0;a< A->nseq-1; a++) { for ( b=a+1; bnseq; b++, n++) { if (strstr (mode, "_sarmat2")) { s=get_sar_sim (A->seq_al[a], A->seq_al[b]); } else if (strstr (mode, "_sar")) { s=get_sar_sim (A->seq_al[a], A->seq_al[b]); } else if ( (p=strstr (mode, "_memory_"))) { int **sim; sscanf ( p, "_memory_%ld", (long int*)&sim); s=sim[a][b]; } else if ( strstr (mode, "_idscore") || strstr ( mode, "_covscore")) { static Sequence *S; if (a==0 && b==1) { free_sequence (S, -1); if ( strstr (mode, "idscoreDNA")) M=read_matrice ("idmat"); else M=read_matrice("blosum62mt"); S=aln2seq(A); } if ( strstr (mode, "_idscore"))s=idscore_pairseq(S->seq[a], S->seq[b], -10,-1, M, "sim"); else s=idscore_pairseq(S->seq[a], S->seq[b], -10,-1, M, "cov"); } else if ( strstr (mode, "cov")) { s=get_seq_sim ( A->seq_al[a], A->seq_al[b],GAP_LIST, "cov"); } else { s=get_seq_fsim2 (A->seq_al[a], A->seq_al[b],GAP_LIST, mode); } fprintf (fp, "BOT\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", a,b,s,max,A->name[a], max, A->name[b], s); fprintf (fp, "TOP\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", b,a,s,max,A->name[b], max, A->name[a], s); tot[a]+=s; tot[b]+=s; bigtot+=s; } } for ( a=0; a< A->nseq; a++) { fprintf (fp, "AVG\t %d\t %*s\t %*s\t %5.2f\n", a,max,A->name[a], max, "*", tot[a]/(A->nseq-1)); } vfree (tot);free_int (M, -1); fprintf (fp, "TOT\t %*s\t %*s\t %5.2f\n", max,"TOT", max, "*", bigtot/n); vfclose (fp); } void output_similarities_pw (char *file, Alignment *A, Alignment *B,char *mode) { float s; float *tot; float bigtot=0; int n, max; FILE *fp; int a, b; int **M=NULL; Sequence *SA, *SB; if ( strstr (mode, "idscoreDNA")) M=read_matrice ("idmat"); else M=read_matrice("blosum62mt"); SA=aln2seq(A); SB=aln2seq(B); for (max=0, a=0; a< A->nseq; a++)max=MAX(max,(strlen (A->name[a]))); for (a=0; a< B->nseq; a++)max=MAX(max,(strlen (B->name[a]))); tot=vcalloc ( A->nseq, sizeof (float)); fp=vfopen (file, "w"); fprintf (fp, "# TC_SIMILARITY_MATRIX_FORMAT_01\n"); for ( a=0; anseq; a++) fprintf ( fp, "# SEQ_INDEX %s %d\n",A->name[a],a); fprintf ( fp, "# PW_SEQ_DISTANCES \n"); for (n=0,a=0;a< A->nseq; a++) { for ( b=0; bnseq; b++, n++) { s=idscore_pairseq(SA->seq[a], SB->seq[b], -10,-1, M, "sim"); fprintf (fp, "BOT\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", a,b,s,max,A->name[a], max, B->name[b], s); fprintf (fp, "TOP\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", b,a,s,max,B->name[b], max, A->name[a], s); tot[a]+=s; tot[b]+=s; bigtot+=s; } } for ( a=0; a< A->nseq; a++) { fprintf (fp, "AVG\t %d\t %*s\t %*s\t %5.2f\n", a,max,A->name[a], max, "*", tot[a]/(A->nseq-1)); } vfree (tot);free_int (M, -1); fprintf (fp, "TOT\t %*s\t %*s\t %5.2f\n", max,"TOT", max, "*", bigtot/n); vfclose (fp); } void output_conservation_statistics ( char *file, Alignment *A) { int a, b, c,c1, c2; double **tot; char aa[1000]; int naa; sprintf (aa, "%s", BLAST_AA_ALPHABET); naa=strlen (aa); tot=declare_double (256, 256); for ( a=0; anseq; a+=2) { b=a+1; for ( c=0; clen_aln; c++) { c1=tolower (A->seq_al[a][c]); c2=tolower (A->seq_al[b][c]); if ( !is_gap(c1) && !is_gap(c2)) { tot[c1][c2]++; tot[c2][c1]++; tot[c1][0]++; tot[c2][0]++; tot[0][0]++; } } } fprintf ( stdout, "# BLAST_MATRIX FORMAT\n#ALPHABET=%s\n",aa); for (a=0; anseq; a++)maxname=MAX(strlen(A->name[a]), maxname); maxname++; fp=vfopen (file, "w"); if (mode[0]=='h') { b=0; while ((c=mode[b++])!='\0') { if ( c=='n') fprintf (fp, "%-*s ",maxname,"name"); if ( c=='l') fprintf (fp, "%-*s ",5,"nres"); if ( c=='g') fprintf (fp, "%-*s ",5,"ngap"); if ( c=='t') fprintf (fp, "%-*s ",5,"len"); } if (is_in_set ( c, "nlgt")) fprintf (fp, "\n"); mode++; } b=0; while ((c=mode[b++])!='\0') { if ( c=='n')break; if ( c=='N'){d=1;fprintf (fp, "NSEQ %d ", A->nseq);} if ( c=='L'){d=1;fprintf (fp, "LEN %d ", A->len_aln);} } if ( d) fprintf (fp, "\n"); for (a=0; anseq; a++) { b=0; d=0; while ((c=mode[b++])!='\0') { if (is_in_set ( c, "nlgt"))d=1; if (c=='n'){d=1;fprintf ( fp, "%-*s ", maxname,A->name[a]);} if (c=='l') { for (n=0,d=0; dlen_aln; d++)n+=!is_gap(A->seq_al[a][d]); fprintf ( fp, "%-5d ",n); } if (c=='g') { for (n=0,d=0; dlen_aln; d++)n+=((is_gap(A->seq_al[a][d]) && !is_gap(A->seq_al[a][d+1]))||(is_gap(A->seq_al[a][d])&& A->seq_al[a][d+1]=='\0')) ; fprintf ( fp, "%-5d ",n); } if (c=='t') { fprintf ( fp, "%-5d ",strlen (A->seq_al[a])); } if (c=='N' && d) { fprintf ( fp, "%-5d ",A->nseq); } if (c=='L'&& d) { fprintf ( fp, "%-5d ",A->len_aln); } } if (d)fprintf ( fp, "\n"); } vfclose (fp); } int output_age_matrix ( char *outfile, int val) { int **mat; int a, b; char alp[]="abcdefghij-"; int naa; mat=declare_int ( 256, 256); naa=strlen (alp); for ( a=0; anseq; a++) { ungap (A->seq_al[a]); lower_string (A->seq_al[a]); s=A->seq_al[a]; l=strlen (s); if ( s[0]=='\0') continue; symbols[(int)s[0]]++; for ( b=1; b< l; b++) { symbols[(int)s[b]]++; table[(int)s[b-1]][(int)s[b]]++; tot++; } } for (naa=0, a=0; a< 256; a++) { if (symbols[a])alp[naa++]=a; } for ( a=0; a< 256; a++) for (b=0; b<256; b++) { if (symbols[a]&& symbols[b] && table[a][b] && tot>0) { freq=(table[a][b])/tot; expected=(symbols[a]*symbols[b])/(tot*tot); log_odd=log (freq/expected); mat[a-'A'][b-'A']=log_odd*10; fmat[a-'A'][b-'A']=log_odd; } else if ( symbols[a]&& symbols[b]) { mat[a-'A'][b-'A']=-999; fmat[a-'A'][b-'A']=-999; } } output_mat ( mat,outfile, alp, 'A'); fp=vfopen (outfile, "a"); for ( a=0; a<256; a++) if ( symbols[a]) { fprintf (fp, "# %c tot: %6d freq: %7.5f\n", a, (int)symbols[a],(float)symbols[a]/tot); } for ( a=0; a< 256; a++) for (b=0; b<256; b++) { if (symbols[a]&& symbols[b]) { freq=(table[a][b])/tot; fprintf (fp, "# %c%c tot: %6d freq: %7.5f log_odd: %9.3f\n", a, b, (int)table[a][b],(float)freq,fmat[a-'A'][b-'A']); } } vfclose (fp); vfree(alp); free_arrayN ((void **)mat, 2); free_arrayN ((void **)fmat, 2); return 1; } void output_est_prf (char *fname, Alignment *A) { int a; FILE *fp; if ( !A->P) { fprintf ( stderr, "\nFormat output_est_prf Impossible: No profile\n"); myexit(EXIT_FAILURE); } fp=vfopen ( fname, "w"); fprintf ( fp, "Consensus Sequence\nReconstructed with %s (%s,%s)\n",PROGRAM,AUTHOR,DATE); fprintf ( fp, "%4c %4c %4c %4c %15s Consensus\n", 'A','G','C','T', "Internal Gaps"); for ( a=0; a< A->len_aln; a++) { fprintf (fp, "%4d %4d %4d %4d %15d %c\n", (A->P)->count[0][a],(A->P)->count[1][a],(A->P)->count[2][a], (A->P)->count[3][a], (A->P)->count[4][a],A->seq_al[0][a]); } return; } void output_gotoh_seq (char *fname, Alignment*A ) { int a; FILE *fp; fp=vfopen ( fname, "w"); fprintf ( fp, "%d %d\n",A->nseq, A->max_len); for ( a=0; a< A->nseq; a++) { ungap ( A->seq_al[a]); fprintf ( fp, ">%s\n", A->name[a]); fp=output_string_wrap ( 50,A->seq_al[a] , fp); fprintf ( fp, "//\n"); } vfclose (fp); } void output_mult_fasta_seq (char *fname, Alignment*A, int n ) { int a; FILE *fp; fp=vfopen (fname, "w"); ungap(A->seq_al[0]); for (a=0; a%s_%d\n%s\n", A->name[0],a+1, A->seq_al[0]); } vfclose (fp); } char * output_fasta_seqX (char *name, char *mode, Sequence *S, Alignment *A, int i) { FILE *fp; if (!name)name=vtmpnam (NULL); fp=vfopen (name, mode); if ( (S && S->nseq<=i) || (A && S->nseq<=i) || (!A && !S)) { fprintf ( stderr, "\nERROR in function reformat:output_fasta_seqX[FATAL:%s]", PROGRAM); myexit (EXIT_FAILURE); } else if ( S) fprintf ( fp, ">%s %s\n%s\n", S->name[i], S->seq_comment[i], S->seq[i]); else if ( A) { ungap (A->seq_al[i]); fprintf ( fp, ">%s %s\n%s\n", A->name[i], A->seq_comment[i], A->seq_al[i]); } vfclose (fp); return name; } void output_fasta_seq1 (char *fname, Alignment*A ) { char seq_name[VERY_LONG_STRING]; int a; FILE *fp; char *extension; for ( a=0; a< A->nseq; a++) { if ( strncmp( fname, "name",4)==0) { if ( (fname+4)[0]!='\0')extension=fname+5; else extension=NULL; sprintf ( seq_name,"%s.%s", A->name[a],(extension==NULL)?"seq":extension); } else sprintf ( seq_name,"%s.seq",A->name[a]); ungap ( A->seq_al[a]); fp=vfopen (seq_name, "w"); fprintf (fp, ">%s %s\n", A->name[a], A->seq_comment[a]); fp=output_string_wrap ( 50, A->seq_al[a],fp); fprintf ( fp, "\n"); vfclose (fp); } } void output_pir_check (char *fname,int nseq, char **comment ) { int a; FILE *fp; if ( fname==NULL)return; fp=vfopen ( fname, "w"); for ( a=0; a< nseq; a++)fprintf (fp, "%s\n", comment[a]); vfclose (fp); } void output_fasta_seq (char *fname, Alignment*A) { main_output_fasta_seq (fname, A, HEADER); } void output_fasta_tree (char *fname, Alignment*A) { int a; FILE *fp; if ( !A || !A->nseq) return; fp=vfopen ( fname, "w"); for ( a=0; a< A->nseq; a++) { fprintf ( fp, ">%s %s\n%s\n", A->name[a], A->seq_comment[a], A->seq_al[a]); } vfclose (fp); } void main_output_fasta_seq (char *fname, Alignment*A,int header ) { int a; FILE *fp; fp=vfopen ( fname, "w"); for ( a=0; a< A->nseq; a++) { ungap(A->seq_al[a]); fprintf ( fp, ">%s", A->name[a]); if (header==HEADER && A->seq_comment[a][0] && !isblanc(A->seq_comment[a]))fprintf (fp," %s\n",A->seq_comment[a]); else fprintf ( fp, "\n"); fp=output_string_wrap ( 50, A->seq_al[a],fp); fprintf ( fp, "\n"); } vfclose (fp); } void output_gor_seq (char *fname, Alignment*A ) { int a; FILE *fp; fp=vfopen ( fname, "w"); for ( a=0; a< A->nseq; a++) { ungap(A->seq_al[a]); fprintf ( fp, "!%s %d \n", A->name[a], (int)strlen(A->seq_al[a])); upper_string ( A->seq_al[a]); fp=output_string_wrap ( 50, A->seq_al[a],fp); fprintf ( fp, "@\n"); } vfclose (fp); } void output_pir_seq (char *fname, Alignment*A ) { int a; for ( a=0; a< A->nseq; a++)ungap(A->seq_al[a]); output_pir_aln (fname, A); } void output_pir_seq1 (char *fname, Alignment*A ) { char seq_name[VERY_LONG_STRING]; int a; FILE *fp; char type[20]; for ( a=0; a< A->nseq; a++) { if ( strm ( get_string_type (A->seq_al[a]),"DNA") || strm ( get_string_type (A->seq_al[a]),"RNA"))sprintf(type, "DL"); else if ( strm ( get_string_type (A->seq_al[a]),"PROTEIN"))sprintf(type, "P1"); sprintf ( seq_name,"%s;%s_%s.seq",type, fname,A->name[a]); ungap ( A->seq_al[a]); fp=vfopen (seq_name, "w"); fprintf (fp, ">%s\n\n", A->name[a]); fp=output_string_wrap ( 50, A->seq_al[a],fp); fprintf ( fp, "\n*\n"); vfclose (fp); } } /*******************************************************************************************/ /* */ /* */ /* OUTPUT ALN */ /* */ /***************************************************************************************** */ void output_mocca_aln (char *outfile, Alignment *A, Alignment *S) { FILE *fp; int **score; char **new_name_order; int a, maxl; score=declare_int (S->nseq, 2); new_name_order=declare_char ( S->nseq,MAXNAMES+1); for ( a=0; anseq; a++) { score[a][0]=a; score[a][1]=S->score_seq[a]; } sort_int_inv (score+1,2,1,0,S->nseq-2); for ( a=0; anseq; a++) { sprintf ( new_name_order[a], "%s", A->name[score[a][0]]); } A=reorder_aln (A, new_name_order, A->nseq); fp=vfopen (outfile, "w"); fprintf ( fp, "MOCCA,(%s,%s, C. Notredame)\nSCORE %d\nNSEQ %d\nLEN %d\n",VERSION,DATE, A->score_aln, A->nseq, A->len_aln); maxl=return_maxlen ( new_name_order, A->nseq); for (a=0; a< A->nseq; a++) { fprintf (fp, "%-*s: %3d\n", maxl, A->name[a], score[a][1]); } fprintf ( fp, "\n"); fp=output_Alignment_without_header ( A, fp); vfclose (fp); free_int (score, -1); free_char (new_name_order, -1); return ; } void print_sub_aln ( Alignment *B, int *ns, int **ls) { Alignment *X; int a, b; X=copy_aln (B, NULL); X->nseq=0; X->len_aln=strlen ( B->seq_al[ls[0][0]]); for (a=0; a< 2; a++) for ( b=0; bnseq++) { sprintf ( X->seq_al[X->nseq], "%s", B->seq_al[ls[a][b]]); sprintf ( X->name[X->nseq], "%s", B->name[ls[a][b]]); } X->name[X->nseq][0]='\0'; print_aln (X); free_aln (X); } void print_aln ( Alignment *B) { while(B) { output_Alignment_without_header ( B, stderr); B=B->A; } } FILE * output_aln ( Alignment *B, FILE *fp){return output_Alignment(B, fp);} FILE * output_Alignment ( Alignment *B, FILE *fp) { fprintf ( fp, "%s, %s (%s) [%s] [MODE: %s]\n%s\nCPU %d sec\nSCORE %d\nNSEQ %d\nLEN %d\n",PROGRAM,VERSION,DATE,retrieve_mode(),URL,AUTHOR, (B->cpu+get_time())/1000, B->score_aln, B->nseq, B->len_aln); return output_Alignment_without_header ( B, fp); } FILE * output_Alignment_without_header ( Alignment *B, FILE *fp) { int a,b, c; int max_len=0; int line; int *n_residues; char s; if (fp==NULL)return fp; for ( a=0; a< B->nseq; a++) {if ( strlen (B->name[a])>max_len) max_len= strlen ( (B->name[a])); } max_len=MAX(max_len+2, 16); line=get_msa_line_length (0, 0); n_residues=vcalloc ( B->nseq+1, sizeof (int)); for ( a=0; anseq; a++)n_residues[a]=(B->output_res_num==2)?B->order[a][1]:0; fprintf ( fp, "\n"); for (a=0; alen_aln; a+=line) {for (b=0; b<=B->nseq; b++) { fprintf (fp,"%-*s",max_len,B->name[b]); if (B->output_res_num)fprintf (fp, " %4d ", n_residues[b]+1); for (c=a;clen_aln;c++) { if (b==B->nseq){n_residues[b]++;s=analyse_aln_column ( B, c);} else {n_residues[b]+=!is_gap(B->seq_al[b][c]); s=GET_CASE(B->residue_case, B->seq_al[b][c]); } fprintf (fp,"%c",s ); } if (B->output_res_num)fprintf (fp, " %4d", n_residues[b]); fprintf (fp,"\n"); } fprintf (fp,"\n"); } fprintf (fp,"\n\n"); vfree (n_residues); return fp; } FILE * output_aln_score ( Alignment *B, FILE *fp){return output_Alignment_score(B, fp);} FILE * output_Alignment_score ( Alignment *B, FILE *fp) { int a, b, c; static int max_len=0; static int line; int ch; if (fp==NULL)return fp; if ( max_len==0) { for ( a=0; a< B->nseq; a++) {if ( strlen (B->name[a])>max_len) max_len= strlen ( (B->name[a])); } max_len+=4; } line=get_msa_line_length(0, 0); sprintf (B->name[B->nseq], "CONS"); fprintf ( fp, "T_COFFEE ALIGNMENT\nCPU TIME:%d sec.\n", (B->cpu+get_time())/1000); fprintf ( fp, "SCORE=%d\n", B->score_aln); for ( a=0;anseq; a++)fprintf ( fp, "%s: %d\n", B->name[a], B->score_seq[a]); fprintf ( fp, "\n"); for (a=0; alen_aln; a+=line) {for (b=0; bnseq; b++) { fprintf (fp,"%-*s",max_len,B->name[b]); for (c=a;clen_aln;c++) { ch=B->seq_al[b][c]; if (ch==NO_COLOR_RESIDUE)fprintf (fp,"-"); else if ( ch==NO_COLOR_GAP)fprintf (fp,"*"); else if ( ch<10 && ch>=0)fprintf (fp,"%d",ch); else if ( ch>10)fprintf (fp,"#"); else if ( ch<0)fprintf (fp,"."); else fprintf (fp,"9"); } fprintf (fp,"\n"); } fprintf (fp,"\n"); fprintf (fp,"%-*s",max_len,B->name[b]); for (c=a;clen_aln;c++) { ch=B->seq_al[b][c]; if (ch==NO_COLOR_RESIDUE)fprintf (fp,"-"); else if ( ch==NO_COLOR_GAP)fprintf ( fp, "*"); else if ( ch<10 && ch>=0)fprintf (fp,"%d",ch); else if ( ch>10)fprintf (fp,"#"); else if ( ch<0)fprintf (fp,"."); else fprintf (fp,"9"); } fprintf (fp,"\n\n\n"); } fprintf (fp,"\n\n"); return fp; } FILE * output_aln_with_res_number ( Alignment *B, FILE *fp){return output_Alignment_with_res_number(B, fp);} FILE * output_Alignment_with_res_number ( Alignment *B, FILE *fp) { int a, b, c; static int max_len=0; static int line; int**order; if (fp==NULL)return fp; if ( max_len==0) { for ( a=0; a< B->nseq; a++) {if ( strlen (B->name[a])>max_len) max_len= strlen ( (B->name[a])); } max_len+=4; line=60; } order=copy_int ( B->order,declare_int ( B->nseq, 2), B->nseq, 2); fprintf ( fp, "T_COFFEE ALIGNMENT\nCPU TIME:%d sec.\n", (B->cpu+get_time())/1000); fprintf ( fp, "\n"); for (a=0; alen_aln; a+=line) {for (b=0; bnseq; b++) { fprintf (fp,"%-*s %3d %4d ",max_len,B->name[b], order[b][0], order[b][1] ); for (c=a;clen_aln;c++) { order[b][1]+=1-is_gap(B->seq_al[b][c]); fprintf (fp,"%c",toupper(B->seq_al[b][c]) ); } fprintf (fp," %4d\n", order[b][1] ); } fprintf (fp,"\n"); } fprintf (fp,"\n\n"); free_int (order, -1); return fp; } void output_constraints ( char *fname, char *mode,Alignment *A) { FILE *fp; Constraint_list *CL; char *buf; char **name_list; if ( !A->CL || strm ( mode, "pdb")) { if (!A->S) { A->S=aln2seq(A); } CL=declare_constraint_list ( A->S, NULL, NULL, 0, NULL, NULL); CL=aln2constraint_list (A,CL, mode); compact_list (CL, 0, CL->ne, "default"); fp=save_constraint_list ( CL, 0, CL->ne,fname, NULL, "lib",A->S); vfclose (fp); free_constraint_list (CL); return; } else if ( strncmp ( mode, "extended_pair", 13)==0) { buf=duplicate_string (mode+14); name_list=vcalloc(2, sizeof(char*)); name_list[0]=strtok (buf,"_"); name_list[1]=strtok (NULL,"_"); mode[13]='\0'; CL=A->CL; compact_list (CL, 0, CL->ne, "default"); fp=save_sub_list_header (vfopen(fname, "w"),2, name_list,CL); fp=save_extended_constraint_list_pair (CL, "pair",name_list[0],name_list[1],fp); fp=save_list_footer (fp, CL); vfree (buf); } else if ( strm2 (mode, "extended_lib","extended_cosmetic")) { CL=A->CL; compact_list (CL, 0, CL->ne, "default"); fp=save_extended_constraint_list ( CL,mode+9, vfopen(fname, "w")); } else { CL=(Constraint_list *)A->CL; compact_list (CL, 0, CL->ne, "default"); fp=save_constraint_list ( CL, 0, CL->ne,fname, NULL, "lib",A->S); } vfclose ( fp); if ( (Constraint_list *)A->CL !=CL)free_constraint_list (CL); return; } void output_model_aln (char *fname, Alignment*A ) { FILE *fp; int a; Dp_Model *M; Dp_Result *R; char *string; if ( A->Dp_result==NULL) { fprintf ( stderr, "\nWARNING Could Not Output Model %s [%s]", fname, PROGRAM); } R=A->Dp_result; M=R->Dp_model; fp=vfopen ( fname, "w"); for (a=0; anstate; a++) { if (M->model_comments[a][0])fprintf ( fp, "#STATE %c: %s\n", 'a'+a, M->model_comments[a]); } string=vcalloc ( R->len+1, sizeof (char)); for (a=0; alen; a++)string[a]=R->traceback[a]+'a'; fprintf ( fp, ">%s\n",fname); fp=output_string_wrap ( 50,string, fp); vfree(string); fprintf ( fp, "\n"); vfclose (fp); return; } char * output_fasta_sub_aln (char *fname, Alignment*A, int ns, int *ls ) { int a,s; FILE *fp; if (fname==NULL)fname=vtmpnam (NULL); fp=vfopen (fname, "w"); for (a=0; a%s %s\n%s\n", A->name[s],A->seq_comment[s],A->seq_al[s]); } vfclose (fp); return fname; } char * output_fasta_sub_aln2 (char *fname, Alignment*A, int *ns, int **ls ) { int a,g,s; FILE *fp; if (fname==NULL)fname=vtmpnam (NULL); fp=vfopen (fname, "w"); for ( g=0; g<2; g++) for (a=0; a%s %s\n%s\n", A->name[s],A->seq_comment[s],A->seq_al[s]); } vfclose (fp); return fname; } int output_suchard_aln (char *out_file, Alignment *A) { int a, b, c, d; FILE *fp; A=back_translate_dna_aln (A); for ( c=0,a=0; alen_aln; a++, c++) { if (c==3)c=0; for (b=0; bnseq; b++) { if (c==2) { A->seq_al[b][a]='-'; } } } A=ungap_aln_n (A, 1); fp=vfopen (out_file, "w"); for ( a=0; a< A->nseq; a++) { for (b=0; b< A->len_aln; b++) { c=tolower(A->seq_al[a][b]); if ( c=='a')d=1; else if ( c=='g')d=2; else if ( c=='c')d=3; else if ( c=='t')d=4; else if ( c=='u')d=5; else d=6; fprintf ( fp, "%d", d); } fprintf ( fp, "\n"); } vfclose (fp); exit (EXIT_SUCCESS); } void output_fasta_aln (char *fname, Alignment*A ) { FILE *fp; int a; int line=0; line=get_msa_line_length (line, A->len_aln+1); fp=vfopen ( fname, "w"); for ( a=0; a< A->nseq; a++) { fprintf ( fp, ">%s", A->name[a]); if ( A->seq_comment[a][0] && !isblanc (A->seq_comment[a]))fprintf ( fp, " %s", A->seq_comment[a]); fprintf ( fp, "\n"); fp=output_string_wrap ( line,A->seq_al[a] , fp); fprintf ( fp, "\n"); } vfclose (fp); } void output_pir_aln (char *fname, Alignment*A ) { int a; FILE *fp; char type[20]; fp=vfopen ( fname, "w"); for ( a=0; a< A->nseq; a++) { if ( strm ( get_string_type (A->seq_al[a]),"DNA") || strm ( get_string_type (A->seq_al[a]),"RNA"))sprintf(type, "DL"); else if ( strm ( get_string_type (A->seq_al[a]),"PROTEIN"))sprintf(type, "P1"); fprintf ( fp, ">%s;%s\n%s\n",type, A->name[a], A->seq_comment[a]); fp=output_string_wrap ( 50,A->seq_al[a] , fp); fprintf ( fp, "\n*\n"); } vfclose (fp); } int landscape_msa; int set_landscape_msa (int len) { if ( len==0)landscape_msa=-1; else { landscape_msa=len; } return landscape_msa; } int get_msa_line_length (int line, int aln_len) { if (landscape_msa==-1) return aln_len; else if ( landscape_msa)return landscape_msa; else if (line) return line; else { return (getenv ("ALN_LINE_LENGTH"))?atoi(getenv("ALN_LINE_LENGTH")):ALN_LINE_LENGTH; } } void output_msf_aln (char *fname,Alignment *B) { int a, b, c; char *seq; int *all_checks; int i,j; long grand_checksum; FILE *fp; int max_len; int line=0; int block=10; int c_block; char aa; line=get_msa_line_length (line, B->len_aln+1); for ( max_len=0,a=0; a< B->nseq; a++)max_len= MAX(strlen ( B->name[a]),max_len); max_len+=5; fp=vfopen (fname, "w"); seq =vcalloc(B->len_aln, sizeof(char)); all_checks =vcalloc(B->nseq, sizeof(int)); for ( i=0; i< B->nseq; i++) { for ( j=0; jlen_aln; j++) { if ( is_gap(B->seq_al[i][j]))seq[j]='.'; else seq[j]=B->seq_al[i][j]=toupper(B->seq_al[i][j]); } all_checks[i] = SeqGCGCheckSum(seq, (int)B->len_aln); } grand_checksum = 0; for(i=0; inseq; i++) grand_checksum += all_checks[i]; grand_checksum = grand_checksum % 10000; fprintf(fp,"PileUp\n\n"); B=get_aln_type(B); fprintf(fp,"\n\n MSF:%5d Type: ",B->len_aln); if(strm ( (B->S)->type, "DNA") || strm ( (B->S)->type, "RNA")) fprintf(fp,"N"); else fprintf(fp,"P"); fprintf(fp," Check:%6ld .. \n\n", (long)grand_checksum); for (i=0; i< B->nseq; i++) { fprintf ( fp, " Name: %s oo Len:%5d Check:%6ld Weight: %.3f\n", B->name[i], B->len_aln,(long)all_checks[i],(B->S)->W?((B->S)->W)->SEQ_W[i]:1.00); } fprintf(fp,"\n//\n\n"); for (a=0; alen_aln; a+=line) { fprintf ( fp,"\n\n"); for (b=0; bnseq; b++) { fprintf (fp,"%-*s ",max_len,B->name[b]); for (c_block=0,c=a;clen_aln;c++) { if ( c_block==block) { fprintf (fp, " "); c_block=0; } c_block++; aa=(is_gap(B->seq_al[b][c]))?'.': toupper(B->seq_al[b][c]); fprintf (fp,"%c",aa ); } if ( c_block==block) { fprintf (fp, " "); c_block=0; } fprintf (fp,"\n"); } } fprintf ( fp,"\n"); vfclose ( fp); vfree(seq); vfree(all_checks); return; } int SeqGCGCheckSum(char *seq, int len) { int i; long check; for( i=0, check=0; i< len; i++,seq++) check += ((i % 57)+1) * toupper(*seq); return(check % 10000); } void old_output_msf_aln (char *fname,Alignment *B) { FILE *fp; static int *put_seq; int a, b, c; int line=0; char aa; char *buf; int max_len; int seq_max_len; line=get_msa_line_length (line, B->len_aln+1); for ( max_len=0,a=0; a< B->nseq; a++)max_len= MAX(strlen ( B->name[a]),max_len); for ( seq_max_len=0,a=0; a< B->nseq; a++)seq_max_len= MAX(strlen ( B->seq_al[a]),max_len); buf=vcalloc(seq_max_len+1, sizeof (int)); if ( put_seq==NULL) put_seq= vcalloc ( B->nseq, sizeof (int)); put_seq[0]=1; for ( b=1; b< B->nseq; b++) { sprintf ( buf, "%s", B->seq_al[b]); ungap(buf); put_seq[b]=( strlen (buf)>0)?1:0; } fp=vfopen ( fname, "w"); fprintf ( fp, "MSF: %d Type P Check: 5083 ..\n", B->len_aln); for ( a=0; a< B->nseq; a++) { if ( put_seq[a]==1) fprintf ( fp,"Name: %s\n",B->name[a]); } fprintf ( fp, "//\n"); for (a=0; alen_aln; a+=line) {for (b=0; bnseq; b++) { if ( put_seq[b]==1) { fprintf (fp,"%-*s ",max_len,B->name[b]); for (c=a;clen_aln;c++) { aa=(B->seq_al[b][c]=='-')?'.': toupper(B->seq_al[b][c]); fprintf (fp,"%c",aa ); } fprintf (fp,"\n"); } } fprintf (fp,"\n"); } fprintf ( fp,"\n\n"); vfclose ( fp); vfree (buf); vfree(put_seq); } void output_saga_aln ( char *name, Alignment *B) { int a, b, c; FILE *fp; int max_len; int line=0; line=get_msa_line_length (line, B->len_aln+1); for ( max_len=0,a=0; a< B->nseq; a++)max_len= (strlen ( B->name[a])>max_len)?(strlen ( B->name[a])):max_len; fp= vfopen ( name, "w"); fprintf (fp, "\nSAGA FORMAT\nalignement %s nseq=%d len=%d\n", name, B->nseq, B->len_aln); fprintf (fp, "\n\n"); for (a=0; alen_aln; a+=line) {for (b=0; bnseq; b++) {fprintf (fp,"%-*s ",max_len,B->name[b]); for (c=a;clen_aln;c++) { fprintf (fp,"%c",(B->seq_al[b][c]) ); } fprintf (fp,"\n"); } fprintf (fp,"\n"); } fprintf (fp,"\n\n"); vfclose ( fp); } void output_compact_aln ( char *name, Alignment *B) { int a, b, c; FILE *fp; int do_print=0; int max_len; int line=0; line=get_msa_line_length (line, B->len_aln+1); for ( max_len=0,a=0; a< B->nseq; a++)max_len= (strlen ( B->name[a])>max_len)?(strlen ( B->name[a])):max_len; fp= vfopen ( name, "w"); fprintf (fp, "\nSAGA FORMAT\nalignement %s nseq=%d len=%d", name, B->nseq, B->len_aln); fprintf (fp, "\n\n"); for (a=0; alen_aln; a+=line) {for (b=0; bnseq; b++) { for ( do_print=0, c=a;clen_aln;c++) do_print+=1-is_gap(B->seq_al[b][c]); if ( do_print>0) { fprintf (fp,"%-*s ",max_len,B->name[b]); for (c=a;clen_aln;c++) { if ( is_gap(B->seq_al[b][c])&& B->seq_al[b][c]!='-' )fprintf (fp,"%c", '-'); else fprintf (fp,"%c",(B->seq_al[b][c]) ); } fprintf (fp,"\n"); } } fprintf (fp,"\n"); } fprintf (fp,"\n\n"); vfclose ( fp); } void output_clustal_aln ( char *name, Alignment *B) { return output_generic_clustal_aln (name, B, "tc_clustal"); } void output_strict_clustal_aln ( char *name, Alignment *B) { return output_generic_clustal_aln (name, B, "strict_clustal"); } void output_generic_clustal_aln ( char *name, Alignment *B, char *mode) { int a, b, c; FILE *fp; int max_len=0; int line=0; int *n_residues; if ( getenv ("SEP_4_TCOFFEE")) { while ( linelen_aln && B->seq_al[0][line]!='o' && B->seq_al[0][line]!='O')line++; if ( B->seq_al[0][line]=='O' || B->seq_al[0][line]=='o')line++; } else { while ( linelen_aln)line++; } if ( line==B->len_aln)line=get_msa_line_length (0, B->len_aln+1); n_residues=vcalloc ( B->nseq+1, sizeof (int)); for ( a=0; a< B->nseq; a++) {if ( strlen (B->name[a])>max_len) max_len= strlen ( (B->name[a])); n_residues[a]=B->order[a][1]; } max_len=MAX(max_len+2, 16); fp= vfopen ( name, "w"); if ( strm (mode, "strict_clustal")) fprintf ( fp, "CLUSTAL W (1.83) multiple sequence alignment"); else fprintf (fp, "CLUSTAL FORMAT for %s %s [%s] [MODE: %s ], CPU=%.2f sec, SCORE=%d, Nseq=%d, Len=%d ", PROGRAM, VERSION,URL, retrieve_mode (),(float)(B->cpu+get_time())/1000, B->score_aln, B->nseq, B->len_aln); fprintf (fp, "\n\n"); if ( B->len_aln==0) { for (b=0; b<=B->nseq; b++) fprintf (fp,"%-*s -\n",max_len, B->name[b]); } else { for (a=0; alen_aln; a+=line) {for (b=0; b<=B->nseq; b++) { if (b!=B->nseq) { fprintf (fp,"%-*s",max_len, B->name[b]); for (c=a;clen_aln;c++) { if ( is_gap(B->seq_al[b][c]))fprintf (fp,"%c", '-'); else { n_residues[b]++; fprintf (fp, "%c", GET_CASE(B->residue_case, B->seq_al[b][c])); } } if (B->output_res_num)fprintf (fp, " %d", n_residues[b]); fprintf (fp,"\n"); } else if ( b==B->nseq) { fprintf (fp,"%-*s",max_len," "); for (c=a;clen_aln;c++) { fprintf ( fp, "%c", analyse_aln_column (B, c)); } fprintf (fp,"\n"); } } fprintf (fp,"\n"); } } fprintf (fp,"\n\n"); vfree (n_residues); vfclose ( fp); } FILE * output_generic_interleaved_aln (FILE *fp, Alignment *B, int line, char gap, char *mode) { int a, b, c; int max_len=0; int *n_residues; n_residues=vcalloc ( B->nseq+1, sizeof (int)); for ( a=0; a< B->nseq; a++) {if ( strlen (B->name[a])>max_len) max_len= strlen ( (B->name[a])); n_residues[a]=B->order[a][1]; } max_len=MAX(max_len+2, 16); if ( B->len_aln==0) { for (b=0; b<=B->nseq; b++) fprintf (fp,"%-*s -\n",max_len, B->name[b]); } else { for (a=0; alen_aln; a+=line) {for (b=0; b<=B->nseq; b++) { if (b!=B->nseq) { fprintf (fp,"%-*s",max_len, B->name[b]); for (c=a;clen_aln;c++) { if ( is_gap(B->seq_al[b][c]))fprintf (fp,"%c", gap); else { n_residues[b]++; fprintf (fp, "%c", GET_CASE(B->residue_case, B->seq_al[b][c])); } } if (B->output_res_num)fprintf (fp, " %d", n_residues[b]); fprintf (fp,"\n"); } } fprintf (fp,"\n"); } } vfree (n_residues); return fp; } void output_phylip_aln ( char *name, Alignment *B) { int a, b, c, d; FILE *fp; int *print_name; static int line=0; line=get_msa_line_length(0, 0); print_name=vcalloc ( B->nseq, sizeof (int)); fp= vfopen ( name, "w"); fprintf (fp, "%3d %d\n", B->nseq, B->len_aln); for (a=0; alen_aln; a+=line) {for (b=0; bnseq; b++) {if ( print_name[b]==0) { fprintf (fp,"%-10.10s ",B->name[b]); print_name[b]=1; } else { fprintf (fp, "%10.10s ", " "); } for (d=0,c=a;clen_aln;c++, d++) { if ( d==10) { fprintf ( fp, " "); d=0; } if ( is_gap(B->seq_al[b][c])&& B->seq_al[b][c]!='-' )fprintf (fp,"%c", '-'); else fprintf (fp,"%c",(B->seq_al[b][c]) ); } fprintf (fp,"\n"); } fprintf (fp,"\n"); } fprintf (fp,"\n\n"); vfclose ( fp); } void output_rnalign (char *out_file, Alignment *A, Sequence *STRUC) { int a, b; FILE *fp; char bank_file[100]; char pep_file[100]; char *buf; sprintf ( bank_file, "%s.mss", out_file); sprintf ( pep_file, "%s.one_rna", out_file); buf=vcalloc ( strlen ( A->seq_al[0]+1), sizeof (char)); for ( b=0,a=0; a< strlen(A->seq_al[0]); a++) { if ( is_gap(A->seq_al[0][a])) buf[a]='.'; else buf[a]=STRUC->seq[0][b++]; } buf[a]='\0'; fp=vfopen ( bank_file, "w"); fprintf ( fp, "ST\n"); fp=output_string_wrap ( 50, buf, fp); fprintf ( fp, "\n\n"); for ( a=0; anseq-1; a++) { fprintf ( fp, "AS %s\n ", A->name[a]); fp=output_string_wrap ( 50, A->seq_al[a], fp); fprintf ( fp, "\n\n"); } vfclose ( fp); fp=vfopen ( pep_file, "w"); fprintf ( fp, ">%s\n", A->name[A->nseq-1]); fp=output_string_wrap ( 50, A->seq_al[A->nseq-1], fp); fprintf ( fp, "\n"); vfclose (fp); } void output_lib (char *pw_lib_saga_aln_name, Alignment *A ) { Alignment *B; char fname[VERY_LONG_STRING]; int a,b; B=declare_Alignment (NULL); B->nseq=2; for ( a=0; a< A->nseq-1; a++) { for ( b=a+1; bnseq; b++) { sprintf ( B->seq_al[0], "%s", A->seq_al[a]); sprintf ( B->name[0], "%s", A->name[a]); sprintf(B->name[1], "%s", A->name[b]); sprintf ( B->seq_al[1], "%s",A->seq_al[b]); B->nseq=2; sprintf ( fname, "%s_%s_%s.lib",pw_lib_saga_aln_name, A->name[a], A->name[b]); B->len_aln=strlen ( B->seq_al[0]); ungap_aln (B); output_clustal_aln (fname,B); } } } void output_pw_lib_saga_aln (char *pw_lib_saga_aln_name, Alignment *A ) { Alignment *B; char fname[VERY_LONG_STRING]; int a,b; B=declare_Alignment (NULL); B->nseq=2; for ( a=0; a< A->nseq-1; a++) { for ( b=a+1; bnseq; b++) { sprintf ( B->seq_al[0], "%s", A->seq_al[a]); sprintf ( B->name[0], "%s", A->name[a]); sprintf(B->name[1], "%s", A->name[b]); sprintf ( B->seq_al[1], "%s",A->seq_al[b]); B->nseq=2; sprintf ( fname, "%s_%s_%s.pw_lib_saga_aln",pw_lib_saga_aln_name, A->name[a], A->name[b]); B->len_aln=strlen ( B->seq_al[0]); ungap_aln (B); output_clustal_aln (fname,B); } } } void output_lalign_header( char *name, Alignment *A) { FILE *fp; fp=vfopen ( name, "w"); fprintf ( fp, " Lalign mode: best local alignments between two sequences\n"); fprintf ( fp, " %s(%s) [%s]\n\n", VERSION, DATE, URL); fprintf ( fp, " Comparison of:\n(A) %s\t%s\t-%d aa\n", (A->S)->file[A->order[0][0]],(A->S)->name[A->order[0][0]], (A->S)->len[A->order[0][0]]); fprintf ( fp, "(B) %s\t%s\t-%d aa\n", (A->S)->file[A->order[1][0]],(A->S)->name[A->order[1][0]], (A->S)->len[A->order[1][0]]); vfclose ( fp); return; } void output_stockholm_aln (char *file, Alignment *A, Alignment *ST) { FILE *fp; int a,b,l; for (a=0; anseq; a++) for (b=0; blen_aln; b++) if (A->seq_al[a][b]==STOCKHOLM_CHAR)A->seq_al[a][b]='.'; fp=vfopen (file, "w"); fprintf ( fp, "# STOCKHOLM 1.0\n\n"); output_generic_interleaved_aln (fp,A, 50, '.', NULL); fprintf ( fp, "//\n"); vfclose (fp); } void output_glalign ( char *name, Alignment *B, Alignment *S) { int a, b, g, s; int naln=0; FILE *fp; int **nr; B=B->A; if ( B==NULL){return;} fp=vfopen (name, "w"); fprintf (fp, "Format: GLALIGN_01 [Generated with %s ]\n", PROGRAM); fprintf (fp, "#Each Line corresponds to a column\n"); fprintf (fp, "#First column coresponds to first genome\n"); fprintf (fp, "#Last Column gives the column reliability on a 0-9 scale\n"); fprintf (fp, "#[-1] Indicates that the reliability was not evaluated\n"); fprintf (fp, "Genome List\n"); for ( a=0; a< B->nseq; a++) fprintf (fp, "\tGenome %s\n", B->name[a]); fprintf (fp, "Alignment List\n"); while (B) { fprintf (fp, "Alignment %d Len %d Score %d\n", ++naln, B->len_aln, S->score_aln); nr=duplicate_int (B->order, -1, -1); for ( a=0; a< B->len_aln; a++) { fprintf ( fp, "\t"); for ( b=0; b< B->nseq; b++) { g=is_gap (B->seq_al[b][a]); nr[b][1]+=1-g; if (g)fprintf (fp, "---- "); else fprintf ( fp, "%4d ",nr[b][1]); } s=((S)?S->seq_al[S->nseq][a]:-1); if (s==NO_COLOR_RESIDUE)s=-1; fprintf ( fp,"[ %d ]",s); fprintf ( fp, "\n"); } free_int (nr, -1); B=B->A; S=S->A; } vfclose ( fp); } Alignment *input_conc_aln ( char *name, Alignment *IN) { FILE *fp; char *string, *p, *file; Alignment *F=NULL,*A=NULL, *B=NULL; file=vtmpnam (NULL); string=file2string(name); string=substitute ( string, "@", "!Protected!"); string=substitute ( string, TC_REC_SEPARATOR, "@"); strtok (string,"@"); while ( (p=strtok (NULL,"@"))!=NULL) { char *buf; buf=vcalloc ( strlen (p)+1, sizeof (char)); sprintf (buf,"%s", p); buf=substitute (buf,"!protected!", "@"); fp=vfopen (file, "w"); fprintf ( fp, "%s",buf); vfclose (fp); vfree (buf); if ( is_aln (file)) { B=main_read_aln (file,NULL); if ( !A) { if (IN){copy_aln (B, IN);F=A=IN;} else F=A=B; } else { A->A=B; A=A->A; } } } vfree (string); return F; } void output_conc_aln ( char *name, Alignment *B) { FILE *fp; int a; fp=vfopen (name, "w"); fprintf (fp, "# CONC_MSF_FORMAT_01\n"); while (B) { fprintf (fp, "%s\n", TC_REC_SEPARATOR); for ( a=0; a< B->nseq; a++) { fprintf ( fp, ">%s\n%s\n", B->name[a], B->seq_al[a]); } B=B->A; } vfclose (fp); } void output_lalign ( char *name, Alignment *B) { static int output_header; B=B->A; if ( B==NULL){output_header=0;return;} else if ( output_header==0) { output_lalign_header(name, B); output_header=1; } while (B) { output_lalign_aln ( name, B); B=B->A; } } void output_lalign_aln ( char *name, Alignment *B) { int a, b, c,d=0, s=0; char col; float tot=0; float id=0; FILE *fp; int max_len=0; int line; int *n_residues; int res; n_residues=vcalloc ( B->nseq+1, sizeof (int)); for ( a=0; a< B->nseq; a++) {if ( strlen (B->name[a])>max_len) max_len= strlen ( (B->name[a])); n_residues[a]=B->order[a][1]; } max_len=MAX(max_len+2, 16); line=60; fp= vfopen ( name, "a"); for (a=0; a< B->len_aln; a++) { if ( !is_gap(B->seq_al[0][a]) && !is_gap(B->seq_al[1][a])) { tot++; id+=(B->seq_al[0][a]==B->seq_al[1][a]); } } id=(id*100)/tot; fprintf (fp, " %.1f%% identity in %d aa overlap; score: %d\n\n", id,(int)tot, B->score_aln); for (a=0; alen_aln; a+=line) {for (b=0; b<5; b++) { if ( b==0 || b==4) { if ( b==0)s=0; if ( b==4)s=1; fprintf (fp,"%-*s",max_len," "); for (d=0,c=a;clen_aln;c++) { res=!is_gap ( B->seq_al[s][c]); n_residues[s]+=res; if ( (n_residues[s]%10)==0 && res && (c-a+4)name[s]); for (c=a;clen_aln;c++) { if ( is_gap(B->seq_al[s][c]))fprintf (fp,"%c", '-'); else { fprintf (fp, "%c", GET_CASE(B->residue_case, B->seq_al[s][c])); } } fprintf (fp,"\n"); } else if ( b==2) { fprintf (fp,"%-*s",max_len," "); for (c=a;clen_aln;c++) { col=analyse_aln_column (B, c); if ( col=='*')col=':'; else if ( col==':')col='.'; else if ( col=='.')col=' '; fprintf ( fp, "%c", col); } fprintf (fp,"\n"); } } fprintf (fp,"\n"); } fprintf (fp,"\n\n----------\n\n"); vfree (n_residues); vfclose ( fp); } /****************************************************************************************************/ /*************************************UTIL *********************************************************/ /**************************************************************************************************/ /****************************************************************************************************/ /*************************** *************************************/ /*************************** PROCESSING *************************************/ /*************************** *************************************/ /*******************************************************************************************/ /* */ /* */ /* THREADING */ /***************************************************************************************** */ char *thread_aa_seq_on_dna_seq( char *s) { int l, b, c; char *array; l=strlen ( s); array=vcalloc ( l*3 +1, sizeof (char)); for ( b=0, c=0; b< l; b++, c+=3) { array[c]=s[b]; array[c+1]='o'; array[c+2]='o'; } array[c]='\0'; return array; } Alignment *thread_dnaseq_on_prot_aln (Sequence *S, Alignment *A) { Alignment *B=NULL; int a, b, c, n, la, ls, ln, m; B=copy_aln ( A, B); B=realloc_aln2 ( B, B->nseq, B->len_aln*3 +1); for ( n=0,a=0; a< A->nseq; a++) { for ( m=0,b=0; b< S->nseq; b++) { if (strm (A->name[a], S->name[b]) ) { m=1; n++; ungap ( S->seq[b]); B->seq_al[a][0]='\0'; for (la=0, ls=0, ln=0; la< A->len_aln; la++) { for (c=0; c< 3; c++) B->seq_al[a][ls++]=(is_gap(A->seq_al[a][la]))?'-':S->seq[b][ln++]; } B->seq_al[a][ls]='\0'; } } if ( m==0) { for (la=0, ls=0, ln=0; la< A->len_aln; la++) { B->seq_al[a][ls++]=A->seq_al[a][la]; B->seq_al[a][ls++]='-'; B->seq_al[a][ls++]='-'; } } } B->len_aln=strlen ( B->seq_al[0]); return B; } void thread_seq_struc2aln ( Alignment *A, Sequence *ST) { int a, b, c,d; int len, cons; for ( a=0; a< A->nseq; a++) for ( b=0; b< ST->nseq; b++) { if ( strcmp ( A->name[a], ST->name[b])==0) { ungap (ST->seq[b]); len=strlen(A->seq_al[a]); for ( c=0, d=0; cseq_al[a][c]))A->seq_al[a][c]=ST->seq[b][d++]; } } } cons=name_is_in_list ("Cons", ST->name, ST->nseq, 100); if ( cons!=-1 && A->len_aln==strlen ( ST->seq[cons])) { sprintf (A->name[A->nseq], "Cons"); sprintf (A->seq_al[A->nseq],"%s", ST->seq[cons]); A->nseq++; } } void cache_id ( Alignment *A) { int a, b,n; char r1, r2, r3; for ( a=0; a< A->len_aln; a++) { for ( b=0, n=0; b< A->nseq; b++)if ( !is_gap(A->seq_al[b][a]))n++; for ( b=0; b< A->nseq; b++) if ( !is_gap(A->seq_al[b][a]) && n==A->nseq)A->seq_al[b][a]='h'; else if( !is_gap(A->seq_al[b][a]))A->seq_al[b][a]='x'; } for ( a=0; a< A->nseq; a++) { for ( b=1; b< A->len_aln-1; b++) { r1=A->seq_al[a][b-1]; r2=A->seq_al[a][b]; r3=A->seq_al[a][b+1]; if (r2=='h') { if ( (r1=='h' || r1=='b') && (r3=='h' || r3=='b'))A->seq_al[a][b]='h'; else A->seq_al[a][b]='b'; } } for ( b=1; b< A->len_aln-1; b++)if ( A->seq_al[a][b]=='b')A->seq_al[a][b]='x'; } } /*******************************************************************************************/ /* */ /* */ /* PROCESING OF EST */ /* */ /***************************************************************************************** */ int process_est_sequence ( Sequence *S, int *cluster_list) { char **inverted_seq; int T=20; int a, b; int V1, V2; int **sens; int **a_sens; int **best; int *solution; char buf [VERY_LONG_STRING]; int n_clusters=0; int n; sens=declare_int ( S->nseq,S->nseq); a_sens=declare_int ( S->nseq,S->nseq); best=declare_int ( S->nseq,S->nseq); inverted_seq=vcalloc ( S->nseq, sizeof (char*)); for ( a=0; anseq; a++) inverted_seq[a]=invert_seq ( S->seq[a]); for ( a=0; a< S->nseq-1; a++) { for ( b=a+1; bnseq; b++) { V1=sens[a][b]=sens[b][a]=get_best_match ( S->seq[a], S->seq[b]); V2=a_sens[a][b]=a_sens[b][a]=get_best_match ( S->seq[a],inverted_seq[b]); best[a][b]=best[b][a]=(V1>V2)?V1:V2; } } solution=SHC ( S->nseq, a_sens, sens); for ( a=0; anseq; a++)cluster_list[a]=-1; for ( a=0; anseq; a++) { n=search_for_cluster (a, n_clusters, cluster_list, T, S->nseq, best); if ( n>0)n_clusters++; } fprintf ( stderr, "\nTHERE %s %d Independant Cluster(s) in your sequences",(n_clusters>1)?"are":"is",(n_clusters)); for (a=0; anseq; b++) { if ( cluster_list[b]==a)fprintf ( stderr, "%s ", S->name[b]); } } for ( a=0; anseq; a++) { if ( solution[a]==-1) { S->seq[a]=inverted_seq[a]; sprintf ( buf, "i_%s", S->name[a]); sprintf ( S->name[a], "%s", buf); } } return n_clusters; } int search_for_cluster ( int seq, int cluster_number, int *cluster_list, int T, int nseq, int **S) { int n=0,a; if (cluster_list[seq]==-1) { cluster_list[seq]=cluster_number; n++; } for ( a=0; aT) { n++; cluster_list[a]=cluster_number; n+=search_for_cluster ( a, cluster_number, cluster_list, T, nseq, S); } } return n; } int * SHC ( int nseq, int **NST, int **ST) { int a; int mut; int score, new_score; int N_IT=VERY_LONG_STRING; int *sol; int count; sol=vcalloc ( nseq, sizeof (int)); for ( a=0; a49)?1:-1; score=evaluate_sol (sol, nseq, ST, NST); fprintf ( stderr, "\nI_Score=%d\n", score); N_IT=N_IT*nseq; for ( count=0,a=0; a< N_IT && scorescore) { score=new_score; } else if ( (addrand ((unsigned long)VERY_LONG_STRING))>score) { score=new_score; } else sol[mut]=sol[mut]*-1; if ( count==VERY_LONG_STRING) { count=0; fprintf ( stderr, "\nScore=%d", score); } } fprintf ( stderr, "\nScore=%d\n", score); return sol; } int mutate_sol (int *sol, int nseq) { int n; n=addrand ((unsigned long)nseq); sol[n]=sol[n]*-1; return n; } int evaluate_sol ( int *sol, int nseq, int **ST, int **NST) { static int max_score; int a, b, score=0; if ( max_score==0) { for ( a=0; aNST[a][b])?ST[a][b]:NST[a][b]; } } for ( a=0; al2)?l1:l2; m=declare_int (ml, ml); } else if ( (mll2)?l1:l2; m=declare_int (ml, ml); } for ( a=0; abest)?mdiag[a][0]:best; return best; } int** extract_m_diag_streches ( int ** m, int l1, int l2,char *seq1, char *seq2, int *n_mdiag) { int b, x, y, s1, s2; static int **mdiag; int in; static int max_diag=VERY_LONG_STRING; /* diag[0]=len; diag[1]=x_start; diag[2]=y_start; diag[3]=x_end; diag[4]=y_end; */ if ( mdiag==NULL) mdiag=declare_int ( max_diag, 5); for ( s1=l1-1, s2=0;s20) { if (in==1) mdiag[n_mdiag[0]][0]++; else { mdiag[n_mdiag[0]][0]=1; mdiag[n_mdiag[0]][1]=x; mdiag[n_mdiag[0]][2]=y; in=1; } } else if (in==1) { in=0; mdiag[n_mdiag[0]][3]=x-1; mdiag[n_mdiag[0]][4]=y-1; if ( !is_strech ( "ta", seq1, seq2,mdiag[n_mdiag[0]][0], mdiag[n_mdiag[0]][1],mdiag[n_mdiag[0]][2]))n_mdiag[0]++; } if (n_mdiag[0]==(max_diag-1)) {mdiag=vrealloc (mdiag, (max_diag+VERY_LONG_STRING)*sizeof (int*)); for ( b=max_diag; bT)return 1; } return 0; } /************************************************************************************/ /* */ /* STRUC */ /* */ /* */ /************************************************************************************/ char * oneletaa2threeletaa(char aa); float aa2property (char aa, char *mode); int output_seq2struc(char *outfile, Alignment *A) { FILE *fp1, *fp2; int a,c, l; float v, h, x, y, z, dx, dy, dz; char *s; char *tmpfile1, *tmpfile2; char command[1000]; tmpfile1=vtmpnam(NULL); tmpfile2=vtmpnam(NULL); ungap (A->seq_al[0]); s=A->seq_al[0];l=strlen (s); fp1=vfopen (tmpfile1, "w"); x=y=z=0; for ( a=0; a< l; a++) { h=aa2property ( s[a], "doolittle" ); v=aa2property (s[a], "volume"); /*14.398907: peptide bond length*/ dx=(float)sqrt ((double)(14.398907/(((h*h)/(v*v))+1))); dy=dx*(h/v); dz=0; x+=dx; y+=dy; z+=dz; fprintf (fp1, "ATOM%7d CA %s A%4d%12.3f%8.3f%8.3f 1.00 5.30\n",a+1, oneletaa2threeletaa(s[a]),a+1, x, y, z); } vfclose (fp1); sprintf ( command, "extract_from_pdb -infile %s -force > %s", tmpfile1, tmpfile2); my_system (command); fp1=vfopen (tmpfile2, "r"); fp2=vfopen (outfile, "w"); while ( (c=fgetc(fp1))!=EOF)fprintf (fp2, "%c", c); vfclose (fp1); vfclose (fp2); return 0; } char * oneletaa2threeletaa(char aa) { aa=tolower (aa); if ( aa=='a')return "ALA"; else if ( aa=='r') return "ARG"; else if ( aa=='n') return "ASN"; else if ( aa=='d') return "ASP"; else if ( aa=='c') return "CYS"; else if ( aa=='q') return "GLN"; else if ( aa=='e') return "GLU"; else if ( aa=='g') return "GLY"; else if ( aa=='h') return "HIS"; else if ( aa=='i') return "ILE"; else if ( aa=='l') return "LEU"; else if ( aa=='k') return "LYS"; else if ( aa=='m') return "MET"; else if ( aa=='f') return "PHE"; else if ( aa=='p') return "PRO"; else if ( aa=='s') return "SER"; else if ( aa=='t') return "THR"; else if ( aa=='w') return "TRP"; else if ( aa=='y') return "TYR"; else if ( aa=='v') return "VAL"; else { fprintf ( stderr, "\nERROR: %c is not an amino acid [FATAL::aa2hydropathy::%s]", aa, PROGRAM); myexit (EXIT_FAILURE); return NULL; } return NULL; } float aa2property (char aa, char *mode) { if ( mode==NULL || strm (mode, "doolittle")) { aa=tolower (aa); if ( aa=='i')return 4.5; else if ( aa=='v') return 4.2; else if ( aa=='l') return 3.8; else if ( aa=='f') return 2.8; else if ( aa=='c') return 2.5; else if ( aa=='m') return 1.9; else if ( aa=='a') return 1.8; else if ( aa=='g') return -0.4; else if ( aa=='t') return -0.7; else if ( aa=='w') return -0.9; else if ( aa=='s') return -0.8; else if ( aa=='y') return -1.3; else if ( aa=='p') return -1.6; else if ( aa=='h') return -3.2; else if ( aa=='e') return -3.5; else if ( aa=='q') return -3.5; else if ( aa=='d') return -3.5; else if ( aa=='n') return -3.5; else if ( aa=='k') return -3.9; else if ( aa=='r') return -4.5; else { fprintf ( stderr, "\nERROR: %c is not an amino acid [FATAL::aa2hydropathy::%s]", aa, PROGRAM); myexit (EXIT_FAILURE); } } else if (strm (mode, "volume")) { aa=tolower (aa); if ( aa=='a')return 0.915; else if ( aa=='r') return 2.02; else if ( aa=='n') return 1.35; else if ( aa=='d') return 1.24; else if ( aa=='c') return 1.18; else if ( aa=='q') return 1.61; else if ( aa=='e') return 1.55; else if ( aa=='g') return 0.66; else if ( aa=='h') return 1.67; else if ( aa=='i') return 1.69; else if ( aa=='l') return 1.68; else if ( aa=='k') return 1.71; else if ( aa=='m') return 1.70; else if ( aa=='f') return 2.03; else if ( aa=='p') return 1.29; else if ( aa=='s') return 0.99; else if ( aa=='t') return 1.22; else if ( aa=='w') return 2.37; else if ( aa=='y') return 2.03; else if ( aa=='v') return 1.41; else { fprintf ( stderr, "\nERROR: %c is not an amino acid [FATAL::aa2hydropathy::%s]", aa, PROGRAM); myexit (EXIT_FAILURE); } } else { fprintf ( stderr, "\nERROR: %s is an unknown mode [FATAL::aa2hydropathy::%s]", mode , PROGRAM); myexit (EXIT_FAILURE); } return 0; } /************************************************************************************/ /* */ /* DNA */ /* */ /* */ /************************************************************************************/ Alignment *code_dna_aln (Alignment *A) { int a, b,l,r; for ( a=0; a< A->nseq; a++) { for (l=0, b=0; b< A->len_aln; b++) { r=A->seq_al[a][b]; if ( r=='-')l++; else if ( r=='~')continue; else if ( r=='.')l++; else if ( !islower(r))A->seq_al[a][b]='4'; else { A->seq_al[a][b]=(l+3)%3+'0'; l++; } } } return A; } Alignment *back_translate_dna_aln (Alignment *A) { /*Given a set of aligned sequences starts from left to right 1 aa->3 nuc ambiguities are randomly resolved. returns the corresponding amino acid alignment */ int a; char *seq ; ungap_aln(A); A=realloc_aln (A, 10000); seq=vcalloc ( 10000, sizeof (char)); for ( a=0; a< A->nseq; a++) { seq=back_translate_dna_seq (A->seq_al[a], seq, RANDOM); sprintf ( A->seq_al[a], "%s", seq); } A->len_aln=A->len_aln*3; compress_aln (A); vfree (seq); return A; } char * back_translate_dna_seq ( char *in_seq,char *out_seq, int mode) { int a,len; len=strlen(in_seq); if (out_seq==NULL)out_seq=vcalloc ( len*3+1, sizeof (char)); out_seq[0]='\0'; for (a=0; atype, "DNA") && !strm (S->type, "RNA")) printf_exit (EXIT_FAILURE, stderr, "Sequences should be *RNA* type [FATAL:%s]\n", PROGRAM); for ( a=0; anseq; a++) { for (b=0; bseq[a]); b++) { if ( S->seq[a][b]=='u') S->seq[a][b]='t'; if ( S->seq[a][b]=='U') S->seq[a][b]='T'; } HERE ("%s", S->seq[a]); } return S; } Sequence *dna_seq2rna_seq (Sequence *S) { int a, b; if ( !strm(S->type, "DNA") && !strm (S->type, "RNA")) printf_exit (EXIT_FAILURE, stderr, "Sequences should be *DNA* type (type=%s) [FATAL:%s]\n", PROGRAM, S->type); for ( a=0; anseq; a++) for (b=0; blen[a]; b++) { if ( S->seq[a][b]=='t') S->seq[a][b]='u'; if ( S->seq[a][b]=='T') S->seq[a][b]='U'; } return S; } int get_longest_frame (char *seq, int mode); Alignment *translate_dna_aln (Alignment *A, int frame) { /*Given a set of aligned sequences starts from left to right 3 nuc->1 aa 2nuc+1gap, 1nuc+2gap->3 gaps 1 stop-> 3gaps returns the corresponding amino acid alignment */ int a, b,r; if (frame==3 || frame ==4) { for (a=0; a< A->nseq; a++) { char *d, *buf, f; d=A->seq_al[a]; f=get_longest_frame (d,frame); buf=vcalloc ( strlen (d)+1, sizeof (char)); if ( f<3) { sprintf (buf, "%s", d+f); sprintf (d, "%s", buf); sprintf (A->seq_comment[a], " frame: %d", f); } else if ( f>=3) { f-=3; sprintf ( buf, "%s", d); buf=complement_string (buf); sprintf (d, "%s",buf+f); sprintf (A->seq_comment[a], " frame: %d Reverse Complement", f); } vfree (buf); } } else { for ( a=0; a< A->nseq; a++) for (b=0; b< frame; b++) A->seq_al[a][b]='-'; ungap_aln(A); } for ( b=0; b< A->nseq; b++) for ( a=0; a< A->len_aln;) { r=translate_dna_codon (A->seq_al[b]+a, 'z'); if (is_gap(r)) { A->seq_al[b][a++]='-'; A->seq_al[b][a++]='-'; A->seq_al[b][a++]='-'; } else if ( r=='x') { A->seq_al[b][a++]='o'; A->seq_al[b][a++]='-'; A->seq_al[b][a++]='-'; } else if ( r=='z') { A->seq_al[b][a++]='x'; A->seq_al[b][a++]='-'; A->seq_al[b][a++]='-'; } else { A->seq_al[b][a++]=r; A->seq_al[b][a++]='-'; A->seq_al[b][a++]='-'; } } compress_aln (A); return A; } int get_longest_frame (char *in_seq, int mode) { char *prot, *seq; int a; int max_l=0, l; int best_frame=0; int nf; seq=vcalloc (strlen (in_seq)+1, sizeof (char)); prot=vcalloc (strlen (in_seq)+1, sizeof (char)); sprintf ( seq, "%s", in_seq); if ( mode == 3)nf=3; else if ( mode == 4) nf=6; for (a=0; a=3)?a-3:a; prot=translate_dna_seq ( seq,f,'\0', prot); l=strlen (prot); if (l>=max_l){max_l=l;best_frame=a;} } vfree (seq); vfree (prot); return best_frame; } Alignment *clean_gdna_aln (Alignment *A) { int a, b, c, r1, r2,s, p, n, tn; int *col; static int **mat; Alignment *T=NULL; int **score; char *buffer; /*Viterbi Parameters*/ int AL=0; /*Allowed Transition*/ int F=-1000000; /*Forbiden Transition*/ int SPLICE_PENALTY=100; int ORF1=0, ORF2=1, ORF3=2, NC=3; int state, pstate, best_e, best_pstate_p,best_state_p, best_pstate_v, best_state_v, v; int nstate=4; int **transitions; int e; int **v_tab_p; int **v_tab; int * is_dna; best_state_p=best_state_v=best_pstate_p=best_pstate_v=best_e=0; buffer=vcalloc ( 100000, sizeof (char)); is_dna=vcalloc ( A->nseq, sizeof (int)); score=declare_int ( A->nseq+1, A->len_aln); if ( !mat)mat=read_matrice("pam250mt"); T=copy_aln (A, T); col=vcalloc ( A->nseq, sizeof (int)); for (a=0; a<= A->len_aln; a++) for ( b=0; b< A->nseq; b++){A->seq_al[b][a]=tolower(A->seq_al[b][a]); A->seq_al[b][a]=(A->seq_al[b][a]=='t')?'u':A->seq_al[b][a];} for ( a=0; a< A->nseq; a++) { sprintf ( buffer, "%s", A->seq_al[a]); ungap (buffer); is_dna[a]=strm ( get_string_type (buffer), "DNA"); } for (a=0; a< A->len_aln-2; a++) { for (b=0; b< A->nseq; b++) { if (is_dna[b])col[b]=translate_dna_codon (A->seq_al[b]+a, 'x'); else col[b]=tolower ( A->seq_al[b][a]); } for (n=0,tn=0,b=0; b< A->nseq; b++) for ( c=b; c< A->nseq; c++ ) { r1=col[b]; r2=col[c]; if (r1=='x' || r2=='x'){score[A->nseq][a]=F;break;} else if (r1=='-' && r2=='-'); else if (r1=='-' || r2=='-'); else { if ( is_dna[b] && is_dna[c])score[A->nseq][a]+= mat[r1-'A'][r2-'A']; else score[A->nseq][a]+=mat[r1-'A'][r2-'A']* (A->nseq*A->nseq); } n+=( !is_gap(r1) && !is_gap(r2)); score[A->nseq][a]=(((tn!=0)?score[A->nseq][a]/tn:0)); } } /*initialisation*/ transitions=declare_int ( nstate, nstate); v_tab=declare_int ( A->len_aln+2, nstate ); v_tab_p=declare_int ( A->len_aln+2, nstate ); for (a=0; anseq; s++) { for ( p=0; p<=A->len_aln; p++){for (state=0; state< nstate; state++)v_tab_p[p][state]=-1; } for (p=1+2; p<= A->len_aln; p++) { for (state=0; state< nstate; state++) { if ( state==NC){e=-best_e;} else { e=score[A->nseq][(p-1)-state]; if ( state==0)best_e=e; else best_e=MAX(e, best_e); } for ( pstate=0; pstatebest_pstate_v) ) { best_pstate_v=v; best_pstate_p=pstate; } } v_tab[p][state]=best_pstate_v; v_tab_p[p][state]=best_pstate_p; if (state==0 ||best_pstate_v>best_state_v ) { best_state_p=state; best_state_v=best_pstate_v; } } } for (p=0; p< A->len_aln; p++)T->seq_al[s][p]='.'; for (p=A->len_aln; p>0; p--) { if ( best_state_p==0)T->seq_al[s][p-1]=translate_dna_codon (A->seq_al[s]+(p-1), 'x'); else if ( best_state_p==1 || best_state_p==2)T->seq_al[s][p-1]='-'; best_state_p=v_tab_p[p][best_state_p]; } } vfree (col); return T; } Alignment *clean_cdna_aln (Alignment *A) { /*Given an alignmnet of nucleotides Returns the same alignmnent whith non coding nucleotides replaced with dots at each position, the emission probability is the sum of pair of the substitution of amino-acids */ int a, b, c,s, p; static int **mat; int *emission; float em1, em2; char *buffer; Alignment *B=NULL; /*Viterbi Parameters*/ int AL=0; /*Allowed Transition*/ int F=-1000000; /*Forbiden Transition*/ int PENALTY=30; int NC, C1,C2, C3, START, END; int nstate=0; int state=0,best_state=0, score=0, best_score=0; int p_state; int e=0; int **score_tab; int **state_tab; int **transitions; int n; int r1, r2, r3; NC=nstate++; C1=nstate++; C2=nstate++; C3=nstate++; START=nstate++; END=nstate++; B=copy_aln (A, B); buffer=vcalloc ( 100000, sizeof (char)); emission=vcalloc (A->len_aln, sizeof (int)); if ( !mat) { mat=read_matrice("pam250mt"); } /*Computation of the emission proba for the coding state*/ for (a=0; a< A->len_aln; a++) { /*First component: % occupancy of the column*/ em1=0; for ( b=0; b< A->nseq; b++) em1+=!is_gap(translate_dna_codon (A->seq_al[b]+a, '-')); em1=em1/(float)A->nseq; /*Second Component: % similarity within column*/ em2=0; for (n=0,b=0; b< A->nseq-1; b++) { r1=translate_dna_codon (A->seq_al[b]+a, '-'); for (c=b+1; cnseq; c++) { r2=translate_dna_codon (A->seq_al[c]+a, '-'); if (is_gap(r2) || is_gap(r1)); else { n++; em2+=((mat[r1-'A'][r2-'A'])>1)?1:0; } } } em2=em2/(float)((n==0)?1:n); emission[a]=(em1*100); } /*initialisation*/ transitions=declare_int ( nstate, nstate); score_tab=declare_int ( A->len_aln+2, nstate ); state_tab=declare_int ( A->len_aln+2, nstate ); for (a=0; anseq; s++) { for ( p=0; p<=A->len_aln; p++){for (state=0; state< nstate; state++){score_tab[p][state]=F;state_tab[p][state]=-1;} } score_tab[0][START]=0; for (p=1; p<= A->len_aln; p++) { for (state=0; state< nstate; state++) { if ( state==START || state==END)continue; else if ( state==NC) e=-10; else if ( state==C1) { e=emission[p-1]; } else if ( state ==C2) { if ( p-2<0)e=F; else e=emission[p-2]; } else if ( state==C3) { if ( p-3<0)e=F; else e=emission[p-3]; } for (p_state=0; p_statebest_score){ best_score=score;best_state=p_state;} } score_tab[p][state]=best_score; state_tab[p][state]=best_state; } } best_score=best_state=UNDEFINED; for (state=0; statebest_score) { best_score=score_tab[p-1][state]+e; best_state=state; } } for (p=A->len_aln; p>0;) { B->seq_al[s][p-1]=best_state+'0'; best_state=state_tab[p][best_state]; p--; } } for ( a=0; a< A->nseq; a++) for ( b=0; b< A->len_aln;) { s=B->seq_al[a][b]; if ( s==C1+'0') { r1=A->seq_al[a][b]; r2=A->seq_al[a][b+1]; r3=A->seq_al[a][b+2]; if ( is_gap(r1) ||is_gap(r2) || is_gap(r3)) { A->seq_al[a][b]=(is_gap(r1))?'~':'.'; A->seq_al[a][b+1]=(is_gap(r2))?'~':'.'; A->seq_al[a][b+2]=(is_gap(r3))?'~':'.'; } b+=3; } else if ( s==NC+'0') { A->seq_al[a][b]=(is_gap(A->seq_al[a][b]))?'~':'.'; b++; } else { fprintf (stderr, "\nPROBLEM: [%d %d]->%d", a, b, s-'0'); } } free_aln (B); free_int (transitions, -1); free_int (score_tab, -1); free_int (state_tab, -1); vfree (emission); vfree (buffer); return A; } Alignment *translate_splice_dna_aln (Alignment *A, Alignment *ST) { int a, b, c, r1, r2,s, p, n, tn; int *col; static int **mat; Alignment *T=NULL; int **score; /*Viterbi Parameters*/ int AL=0; /*Allowed Transition*/ int F=-1000000; /*Forbiden Transition*/ int ORF1=0, ORF2=1, ORF3=2,SPL1=3, SPL2=4, SPL3=5, SPL4=6, NC=7; int SPLICE_PENALTY; int frame1, frame2, frame3, best_frame; int nstate=8; char r; int state=0, pstate=0, best_pstate_p=0,best_state_p=0, best_pstate_v=0, best_state_v=0, v=0; int **transitions; int e=0; int **v_tab_p; int **v_tab; score=declare_int ( A->nseq+1, A->len_aln); if ( !mat)mat=read_matrice("pam250mt"); T=copy_aln (A, T); col=vcalloc ( A->nseq, sizeof (int)); for (a=0; a<= A->len_aln; a++) for ( b=0; b< A->nseq; b++){A->seq_al[b][a]=tolower(A->seq_al[b][a]); A->seq_al[b][a]=(A->seq_al[b][a]=='t')?'u':A->seq_al[b][a];} for (a=0; a< A->len_aln-2; a++) { for (b=0; b< A->nseq; b++) { col[b]=translate_dna_codon (A->seq_al[b]+a, 'x'); } for (n=0,tn=0,b=0; b< A->nseq-1; b++) for ( c=b+1; c< A->nseq; c++, tn++ ) { r1=col[b]; r2=col[c]; if (r1=='x' || r2=='x')score[A->nseq][a]=F; else if (r1=='-' && r2=='-'); else if (r1=='-' || r2=='-'); else { score[A->nseq][a]+= mat[r1-'A'][r2-'A']; } n+=( !is_gap(r1) && !is_gap(r2)); } score[A->nseq][a]=(((tn!=0)?score[A->nseq][a]/tn:0)); } /*initialisation*/ transitions=declare_int ( nstate, nstate); v_tab=declare_int ( A->len_aln+2, nstate*nstate); v_tab_p=declare_int ( A->len_aln+2, nstate*nstate); for (a=0; anseq; s++) { for ( p=0; p<=A->len_aln; p++){for (state=0; state< nstate; state++)v_tab_p[p][state]=-1; } for (p=1+2; p<= A->len_aln; p++) { frame1=score[A->nseq][(p-1)]; frame2=score[A->nseq][(p-1)-1]; frame3=score[A->nseq][(p-1)-2]; best_frame=best_int (3, 1, &a, frame1, frame2, frame3); for (state=0; state< nstate; state++) { r=tolower (A->seq_al[s][p-1]); r=(r=='u')?'t':r; if (state==ORF1)e=frame1; else if (state==ORF2)e=frame2; else if (state==ORF3)e=frame3; else if (state==SPL1)e=(r=='g')?best_frame:F; else if (state==SPL2)e=(r=='t')?best_frame:F; else if (state==SPL3)e=(r=='a')?best_frame:F; else if (state==SPL4)e=(r=='g')?best_frame:F; else if (state==NC)e=-best_frame; for ( pstate=0; pstatebest_pstate_v) ){best_pstate_v=v;best_pstate_p=pstate;} } v_tab[p][state]=best_pstate_v; v_tab_p[p][state]=best_pstate_p; if (state==0 ||best_pstate_v>best_state_v ){best_state_p=state; best_state_v=best_pstate_v;} } } for (p=0; p< A->len_aln; p++)T->seq_al[s][p]='.'; for (p=A->len_aln; p>0; p--) { if ( best_state_p==0)T->seq_al[s][p-1]=toupper(translate_dna_codon (A->seq_al[s]+(p-1), 'x')); else if ( best_state_p>=SPL1 && best_state_p<=SPL4)T->seq_al[s][p-1]='-'; best_state_p=v_tab_p[p][best_state_p]; } } vfree (col); return T; } Alignment * mutate_cdna_aln ( Alignment *A) { int a, b, c, n; int n1, n2, r1, r2; int **pos, ps; int neutral_substitution=50; int random_substitution=0; int random_deletion=0; int amino_acid_deletion=0; int amino_acid_substitution=0; char nuc_list[]="agct"; char *new_codon; neutral_substitution=atoi(get_env_variable ("NEUTRAL_SUBSTITUTION",IS_FATAL)); random_substitution =atoi(get_env_variable ("RANDOM_SUBSTITUTION", IS_FATAL)); random_deletion =atoi(get_env_variable ("RANDOM_DELETION", IS_FATAL)); amino_acid_deletion =atoi(get_env_variable ("AMINO_ACID_DELETION", IS_FATAL)); amino_acid_substitution =atoi(get_env_variable ("AMINO_ACID_SUBSTITUTION", IS_FATAL)); if (A->S)free_sequence ( A->S, (A->S)->nseq); A->S=aln2seq(A); addrandinit(time (NULL)); pos=aln2pos_simple ( A, A->nseq); /* 1 Apply neutral substitutions */ if ( neutral_substitution) { for ( c=0; c< neutral_substitution; c++) { for ( a=0; a< A->nseq; a++) { for ( b=0; b< A->len_aln; b++) { if (pos[a][b]<=0)continue; ps=MAX(0,pos[a][b]-(pos[a][b]-1)%3-1); n1=(A->S)->seq[a][pos[a][b]-1]; r1=translate_dna_codon ( (A->S)->seq[a]+ps, 'o'); n2=nuc_list[(int)addrand((unsigned long) 4)]; (A->S)->seq[a][pos[a][b]-1]=n2; r2=translate_dna_codon ( (A->S)->seq[a]+ps, 'o'); if ( r1==r2 && r1!='o')A->seq_al[a][b]=n2; else (A->S)->seq[a][pos[a][b]-1]=n1; } } } } /* 2 Apply substitutions */ if ( random_substitution) { for ( a=0; a< A->nseq; a++) { for ( b=0; b< A->len_aln; b++) { if (pos[a][b]<=0)continue; if (addrand ((unsigned long) 100)>random_substitution)continue; n1=nuc_list[(int)addrand((unsigned long)4)]; (A->S)->seq[a][pos[a][b]-1]=n1; A->seq_al[a][b]=n1; } } } /* 3 Apply amino acid substitutions */ if ( amino_acid_substitution) { for ( a=0; a< A->nseq; a++) { for ( b=0; b< A->len_aln; b+=3) { if (pos[a][b]<=0)continue; if (addrand ((unsigned long) 100)>amino_acid_substitution)continue; ps=MAX(0,pos[a][b]-(pos[a][b]-1)%3-1); r1=translate_dna_codon ( (A->S)->seq[a]+ps, 'o'); new_codon=mutate_amino_acid(r1, "clustalw_col"); for ( c=ps; cS)->seq[a][c]=new_codon[c-ps]; } for ( b=0; b< A->len_aln; b++) { if (pos[a][b]<=0)continue; else A->seq_al[a][b]=(A->S)->seq[a][pos[a][b]-1]; } } } /* 3 Apply amino acid deletions */ if ( amino_acid_deletion) { for ( a=0; a< A->nseq; a++) { for ( b=0; b< A->len_aln; b+=3) { if (pos[a][b]<=0)continue; if (addrand ((unsigned long) 1000)>amino_acid_deletion)continue; ps=MAX(0,pos[a][b]-(pos[a][b]-1)%3-1); n=addrand ((unsigned long) 4)+1; for ( c=ps; clen_aln; c++)(A->S)->seq[a][c]='-'; } for ( b=0; b< A->len_aln; b++) { if (pos[a][b]<=0)continue; else A->seq_al[a][b]=(A->S)->seq[a][pos[a][b]-1]; } } } /* 4 Apply amino acid insertions */ /*FRAMESHIFT MUTATIONS*/ /* 5 Apply nucleotide deletions*/ if ( random_deletion) { for ( a=0; a< A->nseq; a++) { for ( b=0; b< A->len_aln; b++) { if (pos[a][b]<=0)continue; if (addrand ((unsigned long) 1000)>random_deletion)continue; n1='-'; (A->S)->seq[a][pos[a][b]-1]=n1; A->seq_al[a][b]=n1; } } } /* 6 Apply nucleotide deletions*/ free_int (pos, -1); return A; } Alignment* clean_est ( Alignment *A) { /*Rules are as follow: Internal Gap > 30% Requences ----> - Best Residue < 50% Residues ----> 'N' */ int a, b,c; int best; int tot; for ( a=0; a< A->len_aln; a++) { for (tot=0, b=0; b<4; b++)tot+=(A->P)->count[b][a]; best=best_int (5,1, &c, (A->P)->count[0][a],(A->P)->count[1][a],(A->P)->count[2][a],(A->P)->count[3][a],(A->P)->count[4][a]); if ( tot==0) { fprintf ( stderr, "\nWARNING: POSITION WITH NO INFORMATION [clean_est:%s]", PROGRAM); A->seq_al[0][a]='-'; } else if (((A->P)->count[4][a]*100)/tot >30)A->seq_al[0][a]='-'; else if ( (best*100)/tot<50)A->seq_al[0][a]='n'; } return A; } char **make_symbols ( char *name, int *n) { char **symbol; symbol=declare_char ( STRING, STRING); if ( strcmp (name, "3d_ali")==0) { sprintf ( symbol[0], "gih"); sprintf ( symbol[1], "eb"); sprintf ( symbol[2], "x"); sprintf ( symbol[3], "#l"); n[0]=4; } else if ( strcmp (name, "all")==0) { int a, i; for ( i=0,a=0; a<26; a++) { sprintf ( symbol[i++], "%c%c", 'a'+a, 'a'+a); sprintf ( symbol[i++], "%c%c", 'A'+a, 'A'+a); } sprintf ( symbol[i++], "--"); n[0]=i; } else if ( strcmp (name, "set1")==0) { sprintf ( symbol[0], "ilvmfywhktcagH"); sprintf ( symbol[1], "reqdnsP"); sprintf ( symbol[2], "--"); sprintf ( symbol[3], "#l"); n[0]=4; } else if ( strcmp (name, "set2")==0) { n[0]=0; sprintf ( symbol[n[0]++], "gsacT"); sprintf ( symbol[n[0]++], "ndtvpS"); sprintf ( symbol[n[0]++], "ilkreqL"); sprintf ( symbol[n[0]++], "--"); sprintf ( symbol[n[0]++],"#l"); } else if ( strcmp ( name, "any")==0) { sprintf ( symbol[0], "*x"); n[0]=1; } return symbol; } char * translate_dna_seq_on3frame ( char *dna_seq, char stop, char *prot) { int a, l; char *buf; l=strlen (dna_seq); if ( prot==NULL)prot=vcalloc ( l+2, sizeof (char)); buf=vcalloc (l+4, sizeof (char)); sprintf (buf, "%s", dna_seq); lower_string ( buf); for ( a=0; a< l; a++)buf[a]=(buf[a]=='t')?'u':buf[a]; for (a=0; a< l; a++) prot[a]=translate_dna_codon (buf+a, stop); vfree (buf); prot[a]='\0'; return prot; } char * translate_dna_seq ( char *dna_seq, int frame, char stop, char *prot) { int a, b, l; char *buf; l=strlen (dna_seq); if ( prot==NULL)prot=vcalloc ( l/3 +2, sizeof (char)); buf=vcalloc (l+4, sizeof (char)); sprintf (buf, "%s", dna_seq); lower_string ( buf); for ( a=0; a< l; a++)buf[a]=(buf[a]=='t')?'u':buf[a]; for ( b=0,a=0+frame; a< l; a+=3,b++) prot[b]=translate_dna_codon (buf+a, stop); vfree (buf); prot[b]='\0'; return prot; } char * back_translate_dna_codon ( char aa, int deterministic) { static char *r; int choice; vsrand(0); if ( r==NULL)r=vcalloc (4, sizeof (char)); if (!is_gap(aa))aa=tolower(aa); if (is_gap(aa))sprintf (r, "---"); else if ( aa>=0 && aa<=9) { sprintf (r, "%d%d%d", aa, aa,aa); } else if ( aa>='0' && aa<='9') { sprintf (r, "%c%c%c", aa, aa,aa); } else if ( aa=='a') { choice=(deterministic)?0:rand()%4; if ( choice==0)sprintf (r, "gca"); else if ( choice==1)sprintf (r, "gcg"); else if ( choice==2)sprintf (r, "gcc"); else if ( choice==3)sprintf (r, "gct"); } else if ( aa=='c') { choice=(deterministic)?0:rand()%2; if ( choice==0)sprintf (r, "tgc"); else if ( choice==1)sprintf (r, "tgt"); } else if ( aa=='d') { choice=(deterministic)?0:rand()%2; if ( choice==0)sprintf (r, "gac"); else if ( choice==1)sprintf (r, "gat"); } else if ( aa=='e') { choice=(deterministic)?0:rand()%2; if ( choice==0)sprintf (r, "gaa"); else sprintf (r, "gag"); } else if ( aa=='f') { choice=(deterministic)?0:rand()%2; if ( choice==0)sprintf (r, "ttc"); else sprintf (r, "ttt"); } else if ( aa=='g') { choice=(deterministic)?0:rand()%4; if ( choice==0) sprintf (r, "gga"); else if ( choice==1) sprintf (r, "ggg"); else if ( choice==2) sprintf (r, "ggc"); else if ( choice==3) sprintf (r, "ggt"); } else if ( aa=='h') { choice =rand()%2; if ( choice==0)sprintf (r, "cac"); else sprintf (r, "cat"); } else if ( aa=='i') { choice=(deterministic)?0:rand()%3; if ( choice==0) sprintf (r, "ata"); else if ( choice==1) sprintf (r, "atc"); else if ( choice==2) sprintf (r, "att"); } else if ( aa=='k') { choice=(deterministic)?0:rand()%2; if ( choice==0) sprintf (r, "aaa"); else if ( choice==1) sprintf (r, "aag"); } else if ( aa=='l') { choice=(deterministic)?0:rand()%6; if ( choice==0) sprintf (r, "cta"); else if ( choice==1) sprintf (r, "ctg"); else if ( choice==2) sprintf (r, "ctc"); else if ( choice==3) sprintf (r, "ctt"); else if ( choice==4) sprintf (r, "tta"); else if ( choice==5) sprintf (r, "ttg"); } else if ( aa=='m')sprintf ( r, "atg"); else if ( aa=='n') { choice=(deterministic)?0:rand()%2; if ( choice==0) sprintf (r, "aac"); else if ( choice==1) sprintf (r, "aat"); } else if ( aa=='p') { choice=(deterministic)?0:rand()%4; if ( choice==0) sprintf (r, "cca"); else if ( choice==1) sprintf (r, "ccg"); else if ( choice==2) sprintf (r, "ccc"); else if ( choice==3) sprintf (r, "cct"); } else if ( aa=='q') { choice=(deterministic)?0:rand()%2; if ( choice==0) sprintf (r, "caa"); else if ( choice==1) sprintf (r, "cag"); } else if ( aa=='r') { choice=(deterministic)?0:rand()%6; if ( choice==0) sprintf (r, "cga"); else if ( choice==1) sprintf (r, "cgg"); else if ( choice==2) sprintf (r, "cgc"); else if ( choice==3) sprintf (r, "cgt"); else if ( choice==4) sprintf (r, "aga"); else if ( choice==5) sprintf (r, "agg"); } else if ( aa=='s') { choice=(deterministic)?0:rand()%6; if ( choice==0) sprintf (r, "tca"); else if ( choice==1) sprintf (r, "tcg"); else if ( choice==2) sprintf (r, "tcc"); else if ( choice==3) sprintf (r, "tct"); else if ( choice==4) sprintf (r, "agt"); else if ( choice==5) sprintf (r, "agc"); } else if ( aa=='t') { choice=(deterministic)?0:rand()%4; if ( choice==0) sprintf (r, "aca"); else if ( choice==1) sprintf (r, "acg"); else if ( choice==2) sprintf (r, "acc"); else if ( choice==3) sprintf (r, "act"); } else if ( aa=='v') { choice=(deterministic)?0:rand()%4; if ( choice==0) sprintf (r, "gta"); else if ( choice==1) sprintf (r, "gtg"); else if ( choice==2) sprintf (r, "gtc"); else if ( choice==3) sprintf (r, "gtt"); } else if ( aa=='w') { sprintf (r, "tgg"); } else if ( aa=='y') { choice=(deterministic)?0:rand()%2; if ( choice==0) sprintf (r, "tac"); else if ( choice==1) sprintf (r, "tat"); } else { sprintf (r, "nnn"); } return r; } int translate_dna_codon ( char *sequence, char stop) { char seq[4]; int a,b; if ( (b=strlen (sequence))<3) { for ( a=0; atype, "DNA") || strm(S->type, "RNA"))sprintf (alp, "AGCT"); else if ( strm(S->type, "PROTEIN"))sprintf (alp, "ACDEFGHIKLMNPQRSTVWY"); alp_size=strlen(alp); B=copy_aln (A,NULL); B=realloc_aln(B, B->len_aln*2+1); for ( a=0, b=0; a< A->len_aln; a++, b+=2) { for ( c=0; c< A->nseq; c++) { B->seq_al[c][b]=tolower(A->seq_al[c][a]); B->seq_al[c][b+1]='~'; } } for ( c=0; c< A->nseq; c++)B->seq_al[c][b]='\0'; B->len_aln=A->len_aln*2; tot=n_mut=0; for (a=0; a< B->len_aln; a+=2) for ( b=0; bnseq; b++) { if ( is_gap(B->seq_al[b][a]))continue; mut=((rand()%RAND_MAX)>ratio)?0:1; tot++; n_mut+=mut; if (mut) { type=rand()%2; if (type==0)/*deletion*/ { B->seq_al[b][a]='.'; } else if ( type==1) { B->seq_al[b][a+1]=alp[rand()%alp_size]; } else if (type==2) { B->seq_al[b][a]=alp[rand()%alp_size]; } } } ungap_aln (B); free_sequence (S, S->nseq); free_aln (A); return B; } char* mutate_amino_acid ( char aa, char *mode) { int a, b, c, d; char nucleotide[]="agct"; char amino_acid[]="acdefghiklmnpqrstvwy"; static char **triplet; static char **cw_col; int ng_cw_col; static int **amino_acid_list; static int *lu; char a1, a2; char *mat; aa=tolower(aa); declare_name(mat); if ( !mode)sprintf (mat, "clustalw_col"); else sprintf (mat, "%s", mode); if (!triplet) { triplet=declare_char ( 64, 4); for (d=0, a=0; a< 4;a++) for ( b=0; b< 4; b++) for ( c=0; c< 4; c++, d++) { triplet[d][0]=nucleotide[a]; triplet[d][1]=nucleotide[b]; triplet[d][2]=nucleotide[c]; } } if ( !cw_col)cw_col=make_group_aa ( &ng_cw_col,mat); if ( !amino_acid_list) { amino_acid_list=declare_int ( 20, 65); for ( a=0; a< 20; a++) for ( b=0; b< 64; b++) { a1=translate_dna_codon ( triplet[b], 'x'); a2=amino_acid[a]; for ( d=0; d< ng_cw_col; d++) if ( is_in_set ( a1, cw_col[d]) && is_in_set ( a2, cw_col[d])) { amino_acid_list[a][++amino_acid_list[a][0]]=b; } } lu=vcalloc ( 26, sizeof (int)); for ( a=0; a<20; a++) { lu[amino_acid[a]-'a']=a; } /* for ( a=0; a< 20; a++) { fprintf ( stderr, "\n%c", amino_acid[a]); for ( b=1; b<=amino_acid_list[a][0]; b++) fprintf ( stderr, "\n\t%s %c", triplet[amino_acid_list[a][b]], translate_dna_codon (triplet[amino_acid_list[a][b]], 'x')); } */ } return triplet [addrand((unsigned long)amino_acid_list[lu[aa-'a']][0])+1]; } /**************************************************************************************************/ /******************************** ********************************************/ /******************************** PROCESSING ********************************************/ /******************************** ********************************************/ void modify_data (Sequence_data_struc *D1in, Sequence_data_struc *D2in, Sequence_data_struc *DSTin, char **action_list,int n_actions, Action_data_struc *RAD) { Sequence *COOR=NULL, *NS=NULL,*BUFS=NULL, *OUT_S=NULL; Constraint_list *CL; char *s; int value,upper_value, lower_value, start, end, a, b,c; int *count_table=NULL; char *action; Sequence_data_struc *D1; Sequence_data_struc *D2; Sequence_data_struc *DST; int s1, s2, r1, r2; static int clean_flag; Alignment *BUF; /*Switches*/ action=action_list[0]; if (action[0]=='2') { D1=D2in; D2=D1in; DST=DSTin; action++; } else if ( action[0]=='1') { D1=D1in; D2=D2in; DST=DSTin; action++; } else if ( action[0]=='3') { D1=DSTin; D2=D1in; DST=DSTin; action++; } else { D1=D1in; D2=D2in; DST=DSTin; } if (!D1->A)D1->A=copy_aln (D1in->A, NULL); if ( strm(action, "seqnos")) { (D1->A)->output_res_num=1; } else if ( strm (action,"aln2bootstrap")) { (D1->A)=aln2bootstrap (D1->A, ATOI_ACTION (1)); D1->S=aln2seq (D1->A); } else if ( strm (action,"aln2sample")) { (D1->A)=aln2sample (D1->A, ATOI_ACTION (1)); D1->S=aln2seq (D1->A); } else if ( strm (action,"aln2random_aln")) { (D1->A)=aln2random_aln (D1->A, ACTION (1)); D1->S=aln2seq (D1->A); } else if ( strm (action, "or_scan")) { HERE ("OR SCAN"); D1->A=or_scan(D1->A, D2->A, ACTION(1)); D1->S=aln2seq (D1->A); } else if ( strm (action, "or_sar")) { D1->A=or_sar(D1->A, D2->A, ACTION(1), PRINT); D1->S=aln2seq (D1->A); } else if ( strm ( action, "sar2subsar")) { /*in->sequences in2->sar data */ Alignment *subA, *subS; if ( n_actions==1) { fprintf ( stderr, "\nin=aln, in2=sar sar2subsar [filter value compound1 compound2...] | [jack1] | [file]\n"); myexit (EXIT_FAILURE); } sarset2subsarset ( D1->A, D2->A, &subA, &subS, main_read_aln (action_list[2], NULL)); D1->A=subA;D2->A=subS; } else if ( strm (action, "display_sar")) { D1->A=display_sar (D1->A, D2->A, action_list[1]); } else if ( strm ( action, "sar2simpred")) { /*in->sequences in2->sar data */ sar2simpred ( D1->A, D2->A, action_list[1], action_list[2], atoi(action_list[3]), atoi (action_list[4])); } else if ( strm ( action, "sar2simpred2")) { /*in->sequences in2->sar data */ if ( n_actions!=5) { fprintf ( stderr, "\nERROR: +sar2simpred2 seqnamesfile posfile compound limit"); myexit (EXIT_FAILURE); } sar2simpred2 ( D1->A, D2->A, action_list[1], action_list[2], action_list[3], atoi (action_list[4])); } else if ( strm ( action, "sar_analyze")) { /*in->sequences in2->sar data */ sar_analyze ( D1->A, D2->A,action_list[1]); } else if ( strm ( action, "simple_sar_predict")) { //displays each column with ist score; simple_sar_predict (D1->A, D2->A,ACTION(1)); exit (EXIT_SUCCESS); } else if ( strm ( action, "display_sar_analyze")) { //displays each column with ist score; display_simple_sar_analyze_col (D1->A, D2->A,ACTION(1)); exit (EXIT_SUCCESS); } else if ( strm ( action, "display_sar_analyze_pc")) { //displays each column with ist score; display_simple_sar_analyze_pair_col (D1->A, D2->A,ACTION(1)); exit (EXIT_SUCCESS); } else if ( strm ( action, "weight2sar")) { /*in->sequences in2->sar data */ if ( n_actions!=3) { fprintf ( stderr, "\nERROR: +weight2sar "); myexit (EXIT_FAILURE); } D1->A=weight2sar ( D1->A,D2->A, action_list[1], atoi(action_list[2])); } else if ( strm ( action, "sar_weight")) { /*in->sequences in2->sar data */ if ( n_actions!=3) { fprintf ( stderr, "\nERROR: +sar_weight "); myexit (EXIT_FAILURE); } D1->A=aln2weighted_sar_score ( D1->A,D2->A, action_list[1], action_list[2]); D1->S=aln2seq ( D1->A); } else if ( strm (action, "name2unique_name")) { char *tmp1, *tmp2; char command[1000]; tmp1=vtmpnam (NULL); tmp2=vtmpnam (NULL); output_fasta_aln (tmp1,D1->A); free_aln (D1->A);free_sequence (D1->S, -1); sprintf ( command, "fasta_aln2fasta_aln_unique_name.pl %s >%s", tmp1, tmp2); my_system ( command); D1->S=get_fasta_sequence ( tmp2, NULL); D1->A=seq2aln (D1->S,NULL, 1); } else if ( strm (action, "rm_tag") || strm (action, "rm_template")) { char **temp_name=NULL,**temp_list=NULL, temp_nseq=0; int z; if ( D1 && D1->A){temp_name=(D1->A)->name;temp_nseq=(D1->A)->nseq;} else if ( D1 && D1->S){temp_name=(D1->S)->name;temp_nseq=(D1->S)->nseq;} temp_list=rm_name_tag (temp_name,temp_nseq, NULL); if ( n_actions>1 && strm (action_list[1], "template")) { for ( z=0; zS=seq2template_seq (D1->S, action_list[1], NULL); D1->A=seq2aln(D1->S, NULL, 1); } else if ( strm ( action, "seq2year")) { D1->S=seq2year (D1->S, (n_actions>1)?atoi(action_list[1]):1); D1->A=seq2aln(D1->S, NULL, 1); } else if ( strm (action, "swap_lib_header")) { Sequence *S; S=main_read_seq (action_list[1]); (D1->CL)->S=S; } else if ( strm (action, "weight_lib")) { int l; int w; w=atoi (action_list[1]); if ( D1->CL) { for (l=0; l<(D1->CL)->ne; l++) (D1->CL)->L[l*CL->entry_len+WE]=w; } } else if ( strm (action, "struc2nb")) { int c; for ( c=0; c< (D1->S)->nseq; c++) { struclist2nb ((D1->S)->name[c],(D1->S)->seq[c], (D1->S)->seq_comment[c], atof(action_list[1]),ACTION(2),ACTION(3) ); } myexit (EXIT_SUCCESS); } else if ( strm(action, "seq2contacts")) { int z; D1->S=swap_header (D1->S, D2->S); for ( z=0; z< (D1->S)->nseq; z++)sprintf ( (D1->A)->name[z], "%s", (D1->S)->name[z]); DST->S=seq2contacts (D1->S, atof (action_list[1])); DST->A=copy_aln (D1->A, NULL); thread_seq_struc2aln ( DST->A,DST->S); for (z=0; z< (D1->S)->nseq; z++) (DST->A)->S=D1->S; } else if ( strm(action, "struc2contacts")) { char *seq; if ( atof (action_list[3])>0) { seq=map_contacts (action_list[1], action_list[2], atof (action_list[3])); fprintf ( stderr, "\n>%s %s\n%s",action_list[1], action_list[2],seq); } else print_contacts (action_list[1], action_list[2], atof (action_list[3])); myexit (EXIT_SUCCESS); } else if ( strm(action, "treelist_prune")|| strm(action, "prune_treelist")) { Sequence *TS; if (D2 && D2->S)TS=D2->S; else TS=treelist2sub_seq((D1->S),ATOI_ACTION(1)); treelist2prune_treelist ( D1->S,TS, NULL); D1->A=seq2aln (D1->S, NULL, NO_PAD); } else if ( strm (action, "tree2unresolved_nodes")) { int ns; int *l; ns=tree2nseq (D1->T); l=vcalloc (ns, sizeof (int)); tree2nnode_unresolved (D1->T, l); for ( a=0; aT=main_prune_tree ( D1->T, D2->S); } else if ( strm ( action, "tree2seq")) { D1->S=tree2seq(D1->T, NULL); D1->A=seq2aln (D1->S, D1->A, 1); (D1->A)->len_aln=1; for ( a=0; a< (D1->A)->nseq; a++)sprintf ( (D1->A)->seq_al[a], "sequence"); } else if ( strm (action, "seq2dpatree")) { D1->T= seq2dpa_tree(D1->S,"ktup"); } else if ( strm (action, "tree2dpatree")) { D1->T= tree2dpa_tree(D1->T,(D2 && D2->A)?D2->A:D1->A, (n_actions==1)?"idmat":action_list[1]); } else if ( strm (action, "tree2group")) { vfclose (tree2group (D1->T, (tree2seq(D1->T,NULL)), atoi(action_list[1]), atoi(action_list[2]),(n_actions==4)?action_list[3]:NULL, stdout)); myexit (EXIT_SUCCESS); } else if ( strm(action, "unroot")) { D1->T=unroot_tree(D1->T); } else if ( strm(action, "treelist2group")|| strm(action, "treelist2groups") ) { Sequence *TS; if (D2 && D2->S)TS=D2->S; else TS=treelist2seq((D1->S)); treelist2groups (D1->S, TS, ACTION(1), stdout); myexit (EXIT_SUCCESS); // treelist2groups (D1->S,(D2)?D2->S:NULL, ACTION(1), stdout ); //exit (EXIT_SUCCESS); } else if ( strm(action, "splits2tree")) { D1->T=split2tree ((D2)?D2->T:NULL,D1->S, ACTION(1)); } else if ( strm(action, "count_splits")) { count_splits ((D2)?D2->T:NULL,D1->S, ACTION(1)); exit (EXIT_SUCCESS); } else if ( strm(action, "count_groups")) { count_tree_groups (D1->S, ACTION(1)); } else if ( strm (action, "tree2dist")) { int ta, tb, ***td; Sequence *TS; TS=(D2)?D2->S:NULL; td=tree2dist (D1->T,TS, NULL); if (!TS)TS=tree2seq(D1->T, NULL); for (ta=0; tanseq; ta++) { fprintf ( stdout, "%-15s ",TS->name[ta]); for ( tb=0; tbnseq; tb++) { int n=0; if ( ACTION(1) && strm (ACTION(1), "length"))n=1; fprintf (stdout, " %4d", td [n][ta][tb]); } fprintf ( stdout, "\n"); } exit (EXIT_SUCCESS); } else if ( strm (action, "treelist2lti")) { Sequence *TS; if (D2 && D2->S)TS=D2->S; else TS=treelist2sub_seq((D1->S),ATOI_ACTION(2)); treelist2lti (D1->S,TS, (int)ATOI_ACTION(1), stdout ); exit (0); } else if ( strm (action,"treelist2frame")) { Sequence *TS; if (D2 && D2->S)TS=D2->S; else TS=treelist2sub_seq((D1->S),ATOI_ACTION(1)); treelist2frame (D1->S, TS); myexit (EXIT_SUCCESS); } else if ( strm (action, "treelist2seq")) { D1->S=treelist2sub_seq (D1->S,ATOI_ACTION(1)); D1->A=seq2aln(D1->S, NULL, 1); } else if ( strm (action, "treelist2leafgroup")) { treelist2leafgroup (D1->S, (D2)?D2->S:NULL, ACTION(1)); exit (0); } else if ( strm(action, "treelist2splits")) { if (D1->T)D1->S=add_file2file_list ((D1->T)->file, NULL); treelist2splits (D1->S, (D2)?D2->S:NULL); } else if ( strm(action, "treelist2dmat")) { treelist2dmat (D1->S); } else if ( strm(action, "tree_cmp") || strm (action, "tree_compare")) { D1->T=main_compare_trees ( D1->T, D2->T, stdout); } else if ( strm (action, "tree_scan")) { D1->T=tree_scan (D1->A, D2->T, ACTION(1), ACTION(2)); } else if ( strm (action, "split_cmp")) { main_compare_splits (D1->T, D2->T, ACTION(1), stdout); } else if ( strm(action, "node_sort")) { node_sort ( action_list[1], D1->T); exit (EXIT_SUCCESS); } else if ( strm ( action, "avg_bootstrap")) { display_avg_bootstrap (D1->T); myexit (EXIT_SUCCESS); } else if ( strm (action, "tree_cog_cmp")) { main_compare_cog_tree (D1->T,action_list[1]); exit (EXIT_SUCCESS); } else if ( strm (action, "tree_aln_cmp")) { main_compare_aln_tree (D1->T, D2->A, stdout); exit (EXIT_SUCCESS); } else if ( strm(action, "change_bootstrap")) { D1->T=reset_boot_tree ( D1->T, (n_actions>=2)?atoi(action_list[1]):0); } else if ( strm(action, "change_distances")) { D1->T=reset_dist_tree ( D1->T, (n_actions>=2)?atof(action_list[1]):0.00); } else if ( strm(action, "aln2tree")) { D1->T=tree_compute (D1->A, n_actions-1, action_list+1); } else if ( strm(action, "similarities2tree")) { D1->T=similarities_file2tree (ACTION(1)); } else if ( strm(action, "original_seqnos")) { (D1->A)->output_res_num=2; } else if ( strm (action, "aln2pred")) { aln2pred (D1->A, D2->A, ACTION (1)); exit (EXIT_SUCCESS); } else if ( strm(action, "evaluate")) { Alignment *A; DST->A=copy_aln (D1->A, NULL); DST->S=aln2seq(DST->A); if (n_actions>1 && strm ( action_list[1], "categories")) { CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt")); DST->A= main_coffee_evaluate_output(DST->A, CL, "categories"); } else if (n_actions>1 && strm ( action_list[1], "sar")) { CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt")); DST->A= main_coffee_evaluate_output(DST->A, CL, "sar"); } else if (n_actions>1 && strstr ( action_list[1], "boxshade")) { char color_mode[1000]; sprintf (color_mode,"boxshade_%d", atoi(ACTION2(2,"30"))); CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt")); DST->A= main_coffee_evaluate_output(DST->A, CL, color_mode); } else { CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice((n_actions==1)?"pam250mt":action_list[1])); DST->A= main_coffee_evaluate_output(DST->A, CL, "matrix"); } DST->S=aln2seq ( DST->A); A=D1->A; sprintf ( A->name[A->nseq], "cons"); sprintf ( A->seq_al[A->nseq], "%s", aln2cons_seq_mat (A, "idmat")); } else if ( strm (action, "sp_evaluate")) { fprintf ( stdout, "SP Score: %.2f", sum_pair ((DST && DST->A)?DST->A:D1->A,ACTION(1),atoi(ACTION2(2,"0")),atoi(ACTION2(3,"0")))); exit (EXIT_SUCCESS); } else if ( strm (action, "lat_evaluate")) { float score; score=lat_sum_pair ( D1->A, action_list[1]); fprintf ( stdout, "\nLAT_SCORE: %.2f", score); exit (EXIT_SUCCESS); } else if ( strm (action, "add_scale")) { D1->A=aln2scale (D1->A, ACTION(1)); } else if ( strm (action, "RNAfold_cmp")) { D1->A=compare_RNA_fold (D1->A, D2->A); } else if ( strm (action, "aln2alifold")) { D1->A=aln2alifold (D1->A); D1->S=aln2seq ( D1->A); } else if ( strm (action, "add_alifold")) { D1->A=add_alifold2aln (D1->A, (D2)?D2->A:NULL); } else if ( strm (action, "alifold2analyze")) { D1->A=alifold2analyze (D1->A, (D2)?D2->A:NULL, ACTION(1)); D1->S=aln2seq(D1->A); } else if ( strm (action, "aln2conservation")) { D1->A=aln2conservation ( D1->A, ATOI_ACTION (1), ACTION (2)); exit (EXIT_FAILURE); } else if ( strm (action, "aln2cons")) { char *cons_seq; char *cons_name; cons_name=vcalloc (100, sizeof (char)); sprintf(cons_name, "%s", (n_actions<=2)?"Cons":action_list[2]); cons_seq=aln2cons_seq_mat (D1->A, (n_actions==1)?"blosum62mt":action_list[1]); free_aln (D1->A);free_sequence(D1->S, -1); D1->S=fill_sequence_struc (1, &cons_seq, &cons_name); /*keep the gaps*/ (D1->S)->len[0]=strlen (cons_seq); sprintf ( (D1->S)->seq[0], "%s", cons_seq); D1->A=seq2aln (D1->S, NULL, KEEP_GAP); vfree (cons_name);vfree (cons_seq); } else if ( strm (action, "seq2filter")) { D1->S=seq2filter ( D1->S, atoi(action_list[1]), atoi(action_list[2])); } else if ( strm (action, "aln2resindex")) { //-in: aln, file: ref_seq ref_res target_seq //-in2 target sequences aln2resindex (D1->A, (D2)?D2->A:NULL, stdout); exit (EXIT_SUCCESS); } else if (strm(action, "keep_name")) { RAD->keep_name=1-RAD->keep_name; } else if (strm(action, "use_consensus") ||strm(action, "use_cons") ) { RAD->use_consensus=1-RAD->use_consensus; } else if ( strm(action, "ungap")) { seq2aln (D1->S, D1->A, 1); } else if ( strm2(action, "rmgap", "rm_gap")) { ungap_aln_n (D1->A, (n_actions==1)?100:atoi(action_list[1])); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq ( D1->A); (D1->A)->S=D1->S; } else if ( strm(action, "rmgap_col")) { D1->A=remove_gap_column ( D1->A,action_list[1]); } else if ( strm(action,"random")) { D1->A= make_random_aln(NULL,(n_actions==1)?1:atoi(action_list[1]),(n_actions==2)?100:atoi(action_list[2]),"acdefghiklmnpqrstvwy"); D1->S=aln2seq ( D1->A); } else if ( strm(action, "landscape")) { set_landscape_msa ((n_actions==1)?0:atoi(action_list[1])); } else if ( strm(action, "clean_maln")) { if ( !DST) { fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM); myexit(EXIT_FAILURE); } (DST->A)=aln2number (DST->A); D1->A=clean_maln(D1->A, DST->A,(n_actions==1)?1:atoi(action_list[1]),(n_actions==1)?1:atoi(action_list[2])); } else if ( strm (action, "extract")) { COOR=get_pir_sequence (RAD->coor_file, NULL); D1->S=extract_sub_seq ( COOR, D1->S); free_aln (D1->A); D1->A=declare_Alignment(D1->S); seq2aln (D1->S, D1->A, RAD->rm_gap); free_sequence (COOR, COOR->nseq); } else if ( strm (action, "reorder_column")) { Alignment *RO1, *RO2; Sequence *OUT_S; int s; RO1=rotate_aln (D1->A,NULL); if (ACTION(1) && strm (ACTION(1), "tree")) { D1->T=tree_compute (RO1,n_actions-2, action_list+2); OUT_S=tree2seq(D1->T, NULL); RO1=reorder_aln(RO1, OUT_S->name, OUT_S->nseq); } else if ( ACTION(1) && strm (ACTION(1), "random")) { RO1=reorder_aln ( RO1, NULL, RO1->nseq); } RO2=rotate_aln (RO1, NULL); for (s=0; s< RO2->nseq; s++) sprintf ( RO2->name[s], "%s", (D1->A)->name[s]); free_aln (RO1); free_aln (D1->A); D1->A=RO2; D1->S=aln2seq(D1->A); } else if ( strm (action, "reorder")) { if ( n_actions==2 && strm (action_list[1], "random")) { D1->A=reorder_aln ( D1->A, NULL, (D1->A)->nseq); } else if (n_actions==2 && strm (action_list[1], "scramble")) { D1->A=aln2scramble_seq(D1->A); } else if ( n_actions==2 && strm (action_list[1], "tree")) { OUT_S=tree2seq (D2->T, NULL); D1->A=reorder_aln(D1->A, OUT_S->name, OUT_S->nseq); free_sequence (D1->S,(D1->S)->nseq); D1->S=aln2seq (D1->A); } else { (D2->A)->S=aln2seq (D2->A); (D1->A)->S=aln2seq (D1->A); OUT_S=trim_aln_seq_name(D2->A, D1->A); D1->A=reorder_aln(D1->A, OUT_S->name, OUT_S->nseq); free_sequence (D1->S,(D1->S)->nseq); D1->S=aln2seq (D1->A); } } else if ( strm (action, "cat_aln")) { /*D1->A=aln_cat ( D1->A, D2 ->A);*/ if (D2 && D2->A && !ACTION(1)) D1->A=concatenate_aln (D1->A, D2->A, ACTION(1)); else if (ACTION(1) && is_aln(ACTION(1))) { Alignment *B; int n=1; while (ACTION(n)) { B=main_read_aln (ACTION(n), NULL); D1->A=concatenate_aln (D1->A, B, NULL); n++; } D1->S=aln2seq(D1->A); } else { Alignment *A, *B; A=main_read_aln ((D1->A)->name[0], NULL); for ( a=1; a<(D1->A)->nseq; a++) { B=main_read_aln ((D1->A)->name[a], NULL); A=concatenate_aln (A, B, ACTION(1)); } D1->A=A; D1->S=aln2seq(D1->A); } } else if ( strm ( action, "msalist2cat_pwaln")) { int a, b, c; int sim, min, max; if (n_actions!=3) { min=0; max=100; } else { min=atoi(action_list[1]); max=atoi(action_list[2]); } fprintf ( stdout, ">A\n"); for (a=0;a<(D1->S)->nseq; a++) { Alignment *A; HERE ("process %s", (D1->S)->name[a]); A=main_read_aln((D1->S)->name[a],NULL); for (b=0; bnseq-1; b++) { for ( c=b+1; cnseq; c++) { sim=get_seq_sim (A->seq_al[b], A->seq_al[c], "-", ""); if (sim>=min && sim<=max)fprintf (stdout, "xxx%s", A->seq_al[b]); } } free_aln (A); } fprintf ( stdout, "\n>B\n"); for (a=0;a<(D1->S)->nseq; a++) { Alignment *A; HERE ("process %s", (D1->S)->name[a]); A=main_read_aln((D1->S)->name[a],NULL); for (b=0; bnseq-1; b++) { for ( c=b+1; cnseq; c++) { sim=get_seq_sim (A->seq_al[b], A->seq_al[c], "-", ""); if (sim>=min && sim<=max)fprintf (stdout, "xxx%s", A->seq_al[c]); } } free_aln (A); } fprintf ( stdout, "\n"); exit (EXIT_SUCCESS); } else if ( strm (action, "collapse_tree")) { D1->T=tree2collapsed_tree (D1->T, n_actions-1, action_list+1); } else if ( strm (action, "collapse_aln")) { D1->A=aln2collapsed_aln (D1->A, n_actions-1, action_list+1); } else if ( strm (action, "extract_aln")) { D1->A=aln2sub_aln_file (D1->A, n_actions-1, action_list+1); myexit (EXIT_SUCCESS); } else if ( strm (action, "remove_aa")) { int pos,len, n; pos=atoi(action_list[1]); len=atoi(action_list[2]); n=atoi (action_list[3]); if ( atoi (action_list[4])==1)len=-len; if (pos && n>1) { fprintf ( stderr, "\nWARNING: rm_aa, position (pos) and iteration number (n) simulatneously defined. Iteration number reset to 1 [%s]\n", PROGRAM); n=1; } for ( a=0; a< n; a++) D1->A=probabilistic_rm_aa (D1->A, pos, len); } else if ( strm (action, "remove_nuc")) { int pos; pos=atoi(action_list[1]); if ( pos>3 || pos<1) printf_exit (EXIT_FAILURE, stderr, "Remove_nuc: indicate a number between 1 and 3\n"); pos--; for ( c=0,a=0; a<(D1->A)->len_aln; a++, c++) { if (c==3)c=0; for (b=0; b<(D1->A)->nseq; b++) { if (c==pos) { (D1->A)->seq_al[b][a]='-'; } } } D1->S=aln2seq (D1->A); } else if (strm ( action, "conserved_positions")) { Alignment *A; int a, b, c; int *cache=NULL; A=D1->A; for ( a=0; a< A->nseq && !cache; a++) { if ( strm (action_list[1], A->name[a])) { cache=vcalloc ( A->len_aln+1, sizeof (int)); for ( c=0,b=0; blen_aln; b++) { if ( is_gap (A->seq_al[a][b]))cache[b]=-1; else cache[b]=++c; } } } for ( a=0; a< A->len_aln; a++) { r1=A->seq_al[0][a]; if ( is_gap(r1))continue; for ( c=0,b=0; bnseq; b++) { r2=A->seq_al[b][a]; c+=(r1==r2)?1:0; } if ( (c*100)/A->nseq>=atoi(action_list[2])) fprintf ( stdout, "COL: %d Res: %c %s %d\n", a+1, r1, action_list[1], cache[a]+atoi(action_list[3])); } exit (EXIT_FAILURE); } else if (strm ( action, "extract_block") ) { BUF=copy_aln (D1->A, NULL); if ( check_file_exists(action_list[1])) BUF=extract_aln3(BUF,action_list[1]); else BUF=extract_aln2(BUF,atoi(action_list[2]),atoi(action_list[3]),action_list[1]); D1->A=copy_aln (BUF,D1->A); } else if ( strm ( action, "extract_pos_list")) { D1->A=alnpos_list2block (D1->A, n_actions-1, action_list+1); } else if ( strm ( action, "seq2msa")) { D1->A=simple_progressive_aln ( D1->S, NULL, NULL, action_list[1]); } else if ( strm ( action, "realign_block") ) { D1->A=realign_block ( D1->A, atoi (action_list[1]), atoi (action_list[2]), (n_actions==4)?action_list[3]:NULL); } else if ( strm (action, "extract_seq")) { int is_file; if ( check_file_exists (action_list[1])) { is_file=1; BUFS=main_read_seq (action_list[1]); action_list=BUFS->name; n_actions=BUFS->nseq; } else { is_file=0; action_list++; n_actions--; } for ( a=0; a< n_actions;) { s=action_list[a]; if ( n_actions==1 || is_file==1) { start=1; end=0; a+=1; } else { start=(strm2 (s,"#","*"))?1:(atoi(action_list[a+1])); end= (strm2 (action_list[a+2],"#","*"))?0:(atoi(action_list[a+2])); a+=3; } if ( strm2 (s, "#", "*")) { OUT_S=extract_one_seq((D1->A)->name[0],start, end, D1->A, RAD->keep_name); for (b=1; b< (D1->A)->nseq; b++) { NS=extract_one_seq((D1->A)->name[b],start, end, D1->A, RAD->keep_name); if (count_n_res_in_array(NS->seq[0], -1)) OUT_S=add_sequence ( NS,OUT_S, 0); } } else { if ( a==1)OUT_S=extract_one_seq(s,start, end, D1->A, RAD->keep_name); else { NS=extract_one_seq(s,start, end, D1->A, RAD->keep_name); OUT_S=add_sequence ( NS,OUT_S, 0); } } } D1->S=OUT_S; free_aln (D1->A); D1->A=declare_Alignment(D1->S); seq2aln (D1->S, D1->A, RAD->rm_gap); } else if ( strm (action, "extract_seq_list")) { if ( check_file_exists (action_list[1])) { BUFS=main_read_seq (action_list[1]); action_list=BUFS->name; n_actions=BUFS->nseq; } else { action_list++; n_actions--; } for ( a=0; a< n_actions;a++) { NS=extract_one_seq(action_list[a],1,0, D1->A, KEEP_NAME); OUT_S=add_sequence ( NS,OUT_S, 0); } D1->S=OUT_S; free_aln (D1->A); D1->A=declare_Alignment(D1->S); seq2aln (D1->S, D1->A, RAD->rm_gap); } else if ( strm (action, "remove_seq") || strm (action, "rm_seq")) { char *buf; char **list; int n; int l; list=declare_char ((D1->S)->nseq, 200); buf=vcalloc ((D1->S)->max_len+1, sizeof (char)); for ( n=0,a=0; a< (D1->A)->nseq; a++) { sprintf (buf, "%s", (D1->S)->seq[a]); ungap (buf); l=strlen(buf); for (c=1, b=1; b< n_actions; b++) { if ( strm (action_list[b], (D1->S)->name[a])){(D1->S)->seq[a]=NULL;break;} else if ( strm (action_list[b], "empty") && l==0) { fprintf ( stderr, "WARNING: Sequence %s does not contain any residue: automatically removed from the set [WARNING:%s]\n",(D1->S)->name[a], PROGRAM); (D1->S)->seq[a]=NULL;break; } else if ( strm (action_list[b], "unique")) { if ( name_is_in_list ((D1->S)->name[a], list,n, 100)!=-1) { (D1->S)->seq[a]=NULL;break; } else { sprintf ( list[n++], "%s", (D1->S)->name[a]); } } } } D1->S=duplicate_sequence (D1->S); free_aln (D1->A); free_char ( list, -1); D1->A=declare_Alignment(D1->S); seq2aln (D1->S, D1->A, RAD->rm_gap); } else if ( strm (action, "aln2overaln")|| strm (action,"overaln_param")) { //mode (lower|number|uanlign) Penalty (0-100) Thresold (0-9) int p1,p2,p3,f, t; char *s; int eb=0; char clean_mode[100]; OveralnP *F; F=vcalloc (1, sizeof (OveralnP)); if ( D2 && D2->A) { D1->A=mark_exon_boundaries (D1->A, D2->A); eb=1; } else if ( get_string_variable ("exon_boundaries")) { Sequence *S; Alignment *EB; EB=seq2aln(S=main_read_seq(s),NULL, 0); D1->A=mark_exon_boundaries (D1->A, EB); free_sequence (S, S->nseq); free_aln (EB); eb=1; } if (ACTION(1)==NULL)sprintf (F->mode, "lower"); else if (strstr (ACTION(1), "h")) { fprintf ( stdout, "aln2unalign lower|number|unalign F P1 P2 P3 T\n"); exit (EXIT_SUCCESS); } else sprintf (F->mode, "%s", ACTION(1)); F->t=ATOI_ACTION(2); F->f=ATOI_ACTION(3); F->p1=ATOI_ACTION(4); F->p2=ATOI_ACTION(5); F->p3=ATOI_ACTION(6); F->p3=ATOI_ACTION(7); if (int_variable_isset ("overaln_target"))f=get_int_variable ("overaln_target"); if (int_variable_isset ("overaln_threshold"))t=get_int_variable ("overaln_threshold"); if (eb)sprintf (F->model, "fsa2"); else sprintf (F->model, "fsa1"); D1->A=aln2clean_pw_aln (D1->A, F); } else if ( strm (action,"aln2unalign")) { Alignment *SA; Sequence *SS; SS=aln2seq(SA); SA=copy_aln (D1->A, NULL); thread_seq_struc2aln (SA, SS); D1->A=unalign_aln (D1->A,SA, ATOI_ACTION(1)); D1->S=aln2seq ( D1->A); } else if ( strm (action, "clean_cdna")) { Alignment *A; A=D1->A; for (a=0; a< A->nseq; a++) { char *d, *buf, f; d=A->seq_al[a]; f=get_longest_frame (d, 3); buf=vcalloc ( strlen (d)+1, sizeof (char)); sprintf (buf, "%s", d+f); sprintf (d, "%s", buf); vfree (buf); } } else if ( strm (action, "clean_cdna2")) { D1->A=clean_cdna_aln ( D1->A); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq ( D1->A); } else if ( strm (action, "aln2short_aln")) { D1->A=aln2short_aln (D1->A, action_list[1], action_list[2], atoi(action_list[3])); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq ( D1->A); } else if ( strm ( action, "complement")) { D1->A=complement_aln (D1->A); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq ( D1->A); } else if ( strm ( action, "translate")) { D1->A=translate_dna_aln( D1->A,(n_actions==1)?0:atoi(action_list[1])); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq ( D1->A); } else if (strm2 ( action, "back_translate","backtranslate")) { D1->A=back_translate_dna_aln( D1->A); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq ( D1->A); } else if (strm ( action, "rotate")) { D1->A=rotate_aln( D1->A, action_list[1]); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq ( D1->A); } else if (strm ( action, "invert")) { D1->A=invert_aln( D1->A); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq ( D1->A); } else if (strm ( action, "code_dna_aln")) { D1->A=code_dna_aln( D1->A); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq ( D1->A); } else if ( strm ( action, "mutate")) { D1->A=mutate_aln( D1->A,(n_actions==1)?"0":action_list[1]); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq (D1->A); } else if ( strm ( action, "thread_profile_on_msa")) { (D1->A)->S=NULL; D1->A=thread_profile_files2aln (D1->A, action_list[1], NULL); D1->S=aln2seq(D1->A); } else if ( strm ( action, "thread_dna_on_prot_aln")) { D1->A=thread_dnaseq_on_prot_aln (D1->S, D2->A); free_sequence (D1->S,(D1->S)->nseq); D1->S=aln2seq (D1->A); } else if ( strm ( action, "thread_struc_on_aln")) { thread_seq_struc2aln ( D2->A, D1->S); D1->A=copy_aln(D2->A, D1->A); free_sequence ( D1->S, (D1->S)->nseq); D1->S=aln2seq (D1->A); } else if ( strm (action, "sim_filter")) { D1->A=sim_filter (D1->A, action_list[1], ACTION (2)); free_sequence (D1->S,(D1->S)->nseq); D1->S=aln2seq (D1->A); } else if ( strm (action, "seq2blast")) { D1->A=seq2blast (D1->S); free_sequence (D1->S,(D1->S)->nseq); D1->S=aln2seq (D1->A); } else if ( strm (action, "trim")) { D1->A=simple_trimseq (D1->A,(D2)?D2->A:NULL, action_list[1], ACTION (2)); free_sequence (D1->S,(D1->S)->nseq); D1->S=aln2seq (D1->A); } else if (strm ( action, "trimTC")) { value=(n_actions==1)?10:atoi(action_list[1]); D1->A=tc_trimseq(D1->A,D1->S,action_list[1]); free_sequence (D1->S,(D1->S)->nseq); D1->S=aln2seq (D1->A); } else if (strm ( action, "trimTC2")) { char *group_file; Alignment *B=NULL; char trim_mode[100]; if ( n_actions==1 || !(strm (action_list[1], "NSEQ") ||strm (action_list[1], "MINID")) ) { fprintf ( stderr, "\nTrimTC2 ()\n"); myexit (EXIT_FAILURE); } sprintf (trim_mode, "%s", action_list[1]);action_list+=2; n_actions-=2; if ( strm ( trim_mode, "NSEQ")) { group_file=tree2Ngroup( (D1)?D1->A:NULL, (D2)?D2->T:NULL, atoi (action_list[0]), vtmpnam(NULL), (n_actions==1)?"idmat":action_list[1]); } else { group_file=tree2Ngroup( (D1)?D1->A:NULL, (D2)?D2->T:NULL, -1*atoi (action_list[0]), vtmpnam(NULL), (n_actions==1)?"idmat":action_list[1]); } B=copy_aln (D1->A, B); B=aln2sub_aln_file (B,1,&group_file); B=aln2sub_seq (B, 1, &group_file); D1->A=extract_sub_aln2 (D1->A, B->nseq, B->name); } else if ( strm (action, "chain")) { D1->A=seq2seq_chain (D1->A,D2->A, ACTION(2)); } else if (strm ( action, "master_trim")) { value=(n_actions==1)?10:atoi(action_list[1]); D1->A=master_trimseq(D1->A,D1->S,action_list[1]); free_sequence (D1->S,(D1->S)->nseq); D1->S=aln2seq (D1->A); } else if ( strm (action, "force_aln")) { char ***rlist=NULL; int count=0; if ( n_actions==2) { if (!is_lib_02(action_list[1])) { fprintf ( stderr, "\nERROR: force_aln requires files in TC_LIB_FORMAT_02 [FATAL:%s]", PROGRAM); myexit (EXIT_FAILURE); } else rlist=file2list (action_list[1], " "); } else { rlist=declare_arrayN(3, sizeof (char),3,7, 10); strcat (rlist[1][1],action_list[1]);strcat (rlist[1][3],action_list[2]); strcat (rlist[1][4],action_list[3]);strcat (rlist[1][6],action_list[4]); sprintf ( rlist[2][0], "-1"); } count=1; while (rlist[count] && atoi(rlist[count][0])!=-1) { char st1[100], st2[100], st3[100], st4[100]; sprintf ( st1, "%s", rlist[count][1]);sprintf ( st2, "%s", rlist[count][3]); sprintf ( st3, "%s", rlist[count][4]);sprintf ( st4, "%s", rlist[count][6]); fprintf ( stderr, "\nFORCE: %s %s %s %s", st1, st2, st3, st4); if (is_number (st1))s1=atoi (st1)-1; else s1=name_is_in_list (st1,(D1->A)->name, (D1->A)->nseq, 100); if ( s1<0 || s1>= (D1->A)->nseq)crash ("wrong sequence index"); r1=atoi (st2)-1; if (is_number (st3))s2=atoi (st3)-1; else s2=name_is_in_list (st3,(D1->A)->name, (D1->A)->nseq, 100); if ( s2<0 || s2>= (D1->A)->nseq)crash ("wrong sequence index"); r2=atoi (st4)-1; (D1->A)=add_constraint2aln ((D1->A), s1, r1, s2, r2); count++; } fprintf ( stderr, "\n"); free_arrayN((void*)rlist,3); } else if (strm ( action, "grep")) { D1->A=grep_seq (D1->A, ACTION(1),ACTION(2), ACTION(3)); if (D1->A==NULL) myexit (EXIT_SUCCESS); else D1->S=aln2seq (D1->A); } else if (strm (action, "find")) { int r, l; char *search_string; search_string=vcalloc ( 30, sizeof (char)); if ( strm (action_list[1], "lower"))sprintf ( search_string, "abcdefghijklmnopqrstuvwxyz"); else if ( strm ( action_list[1], "upper"))sprintf ( search_string, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"); else { vfree (search_string);search_string=vcalloc ( strlen (action_list[1])+1, sizeof (char)); sprintf (search_string, "%s", action_list[1]); } for (a=0; a<(D1->A)->nseq; a++) for ( l=0,b=0; b< (D1->A)->len_aln; b++) { r=(D1->A)->seq_al[a][b]; l+=!is_gap(r); if ( r!='\0' && strrchr (search_string, r)) { /*fprintf ( stdout, "%-15s res %c alnpos %4d seqpos %4d\n", (D1->A)->name[a], r, b+1, l);*/ fprintf ( stdout, "%s %d %d\n", (D1->A)->name[a], l, l+1); } } myexit (EXIT_SUCCESS); } else if ( strm (action, "merge_annotation")) { D1->A=merge_annotation (D1->A, DST?DST->A:NULL, ACTION(1)); D1->S=aln2seq (D1->A); } else if ( strm (action, "color_residue")) { int i; Alignment *A; A=D1->A; DST->A=copy_aln (D1->A, NULL); DST->S=aln2seq (DST->A); for (a=0; a< (DST->S)->nseq; a++)ungap ((DST->S)->seq[a]); if (n_actions>2) { for (a=1; aA)->name, (D1->A)->nseq, 100); if (i!=-1) { (DST->S)->seq[i][atoi(action_list[a+1])-1]='0'+atoi(action_list[a+2])-1; } else fprintf (stderr, "\nWARNING: Could not find Sequence %s", action_list[a]); } } else { char name[1000]; int pos, val; FILE *fp; fp=vfopen (action_list[1], "r"); while (fscanf (fp, "%s %d %d\n", name, &pos, &val)==3) { i=name_is_in_list(name, (D1->A)->name, (D1->A)->nseq, 100); if (i!=-1)(DST->S)->seq[i][pos-1]='0'+val; else fprintf (stderr, "\nWARNING: Could not find Sequence %s", action_list[a]); } vfclose (fp); } DST->A=seq2aln (DST->S, NULL, 1); } else if ( strm (action, "edit_residue")) { FILE *fp; int i, pos; int **p; char mod[100], name[100]; Alignment *A; A=D1->A; p=aln2inv_pos (A); if (n_actions>2) { for (a=1; aA)->name, (D1->A)->nseq, 100); if (i!=-1) { pos=atoi(action_list[a+1]); pos=p[i][pos]-1; sprintf (mod, "%s", action_list[a+2]); if ( strm (mod, "upper"))(D1->A)->seq_al[i][pos]=toupper((D1->A)->seq_al[i][pos]); else if ( strm (mod, "lower"))(D1->A)->seq_al[i][pos]=tolower((D1->A)->seq_al[i][pos]); else (D1->A)->seq_al[i][pos]=mod[0]; } else fprintf (stderr, "\nWARNING: Could not find Sequence %s", action_list[a]); } } else { fp=vfopen (action_list[1], "r"); while (fscanf (fp, "%s %d %s\n", name, &pos, mod)==3) { i=name_is_in_list(name, (D1->A)->name, (D1->A)->nseq, 100); if (i!=-1) { pos=p[i][pos]-1; if ( strm (mod, "upper"))(D1->A)->seq_al[i][pos]=toupper(A->seq_al[i][pos]); else if ( strm (mod, "lower"))A->seq_al[i][pos]=tolower(A->seq_al[i][pos]); else A->seq_al[i][pos]=mod[0]; } else fprintf(stderr, "\nWARNING: Could not find Sequence %s", action_list[a]); } vfclose (fp); } D1->S=aln2seq (D1->A); } else if ( strm (action, "clean_flag")) { clean_flag=1-clean_flag; } else if ( strm (action, "aln2case")) { D1->A=aln2case_aln (D1->A, ACTION(1), ACTION(2)); D1->S=aln2seq(D1->A); } else if ( strm5 (action, "convert","upper","lower", "keep", "switchcase")) { b=1; if ( n_actions>1 && is_number (action_list[b])) { lower_value=upper_value=atoi(action_list[b++]); } else if ( n_actions>1 && strm (action_list[b], "gap")) { DST=vcalloc (1,sizeof(Sequence_data_struc)); DST->A=aln2gap_cache (D1->A,0); lower_value=0; upper_value=0; b++; } else if (n_actions>1 && action_list[b] && action_list[b][0]=='[') { lower_value=atoi(strtok (action_list[b]+1, "-[]")); upper_value=atoi(strtok (NULL, "-[]")); b++; } else { lower_value=upper_value=-1; } if ( n_actions >b ||strm (action, "keep") ) { if ( !RAD->symbol_list)RAD->symbol_list=declare_char (STRING, STRING); RAD->n_symbol=0; if ( strm (action, "keep") )sprintf ( RAD->symbol_list[RAD->n_symbol++], "#-"); else { for (a=b; a< n_actions; a++) { sprintf ( RAD->symbol_list[RAD->n_symbol], "%s", action_list[a]); RAD->n_symbol++; } } } for ( value=0; value<=9; value++) { if ( lower_value==-1)value=-1; if ( (value>=lower_value && value<=upper_value)|| value==-1) { if (strm(action,"convert")) D1->A=filter_aln_convert (D1->A, DST?DST->A:NULL,RAD->use_consensus,value,RAD->n_symbol, RAD->symbol_list); else if (strm(action,"upper"))D1->A=filter_aln_lower_upper (D1->A, DST?DST->A:NULL,RAD->use_consensus,value); else if (strm(action,"lower"))D1->A=filter_aln_upper_lower (D1->A, DST?DST->A:NULL,RAD->use_consensus,value); else if (strm(action,"switchcase"))D1->A=filter_aln_switchcase (D1->A, DST?DST->A:NULL,RAD->use_consensus,value); } else { if (strm(action,"keep")) D1->A=filter_aln_convert (D1->A, DST?DST->A:NULL,RAD->use_consensus,value,RAD->n_symbol, RAD->symbol_list); } if (value==-1)break; } /*free_sequence (D1->S,(D1->S)->nseq);*/ if (!D1->S)D1->S=aln2seq (D1->A); } else if ( strm ( action, "count_pairs")) { int a, b,c,v, **matrix; Alignment *A; matrix=declare_int (300,300); A=D1->A; for ( a=0; a< A->nseq-1; a++) for (b=0; b< A->nseq; b++) for (c=0; clen_aln; c++) matrix[(int)A->seq_al[a][c]][(int)A->seq_al[b][c]]++; for ( a=0; a<255; a++) for ( b=a; b<256; b++) { v=matrix[a][b]+matrix[b][a]; if (v)fprintf ( stdout, "\n%c %c %d", a, b, v); } exit (EXIT_SUCCESS); } else if ( strm (action, "count_misc")) { count_misc (D1->A, (!D2)?NULL:D2->A); } else if ( strm (action, "count")) { b=1; if ( n_actions>1 && is_number (action_list[b])) { lower_value=upper_value=atoi(action_list[b++]); } else if (n_actions>1 && action_list[b] && action_list[b] && action_list[b][0]=='[') { lower_value=atoi(strtok (action_list[b]+1, "-[]")); upper_value=atoi(strtok (NULL, "-[]")); b++; } else { lower_value=upper_value=-1; } if ( n_actions >b) { if ( !RAD->symbol_list)RAD->symbol_list=declare_char (STRING, STRING); RAD->n_symbol=0; for (a=b; a< n_actions; a++) { sprintf ( RAD->symbol_list[RAD->n_symbol], "%s", action_list[a]); RAD->n_symbol++; } } for ( value=lower_value; value<=upper_value; value++) { count_table=count_in_aln (D1->A, DST?DST->A:NULL,value,RAD->n_symbol, RAD->symbol_list, count_table); } for ( a=0; an_symbol; a++) { fprintf ( stdout, "%s %d\n", RAD->symbol_list[a], count_table[a]); } free_sequence (D1->S,(D1->S)->nseq); D1->S=aln2seq (D1->A); vfree(count_table); exit(EXIT_SUCCESS); } else if ( strm (action, "msa_weight")) { int random_value; char command [LONG_STRING]; char aln_name[FILENAMELEN]; char tree_name[FILENAMELEN]; char dist_matrix_name[FILENAMELEN]; char weight_name[FILENAMELEN]; char method_4_msa_weights[1000]; if ( n_actions==1) { fprintf ( stderr, "\nError: msa_weight requires a weight_method"); } sprintf ( method_4_msa_weights, "%s", (get_env_variable ("METHOD_4_MSA_WEIGHTS",NO_REPORT))?get_env_variable ("METHOD_4_MSA_WEIGHTS",NO_REPORT):METHOD_4_MSA_WEIGHTS); /*1 Computation of the tree and the distance matrix*/ random_value=addrand ((unsigned long) 100000)+1; sprintf (aln_name, "%d.aln", random_value); sprintf (tree_name, "%d.ph", random_value); sprintf (dist_matrix_name, "%d.dst", random_value); sprintf (weight_name, "%d.weight", random_value); output_fasta_aln (aln_name, D1->A); sprintf ( command, "clustalw -infile=%s -tree -outputtree=dist %s", aln_name, TO_NULL_DEVICE); my_system ( command); sprintf ( command, "%s -method %s -aln %s -tree %s -dmatrix %s -weightfile %s %s",method_4_msa_weights, action_list[1],aln_name, tree_name, dist_matrix_name,weight_name, TO_NULL_DEVICE); my_system ( command); (D1->A)->S=aln2seq (D1->A); ((D1->A)->S)->W=read_seq_weight ( (D1->A)->name, (D1->A)->nseq,weight_name); vremove (weight_name); vremove (aln_name); vremove (tree_name); vremove (dist_matrix_name); } else if ( strm (action, "pavie_seq2random_seq")) { D1->S=pavie_seq2random_seq (D1->S, action_list[1]); D1->A=seq2aln (D1->S,NULL,1); } else if ( strm ( action, "pavie_seq2noisy_seq")) { /* ()*/ D1->S=pavie_seq2noisy_seq (D1->S, atoi(action_list[1]),ACTION(2)); D1->A=seq2aln (D1->S,NULL,1); } else if ( strm (action, "pavie_seq2pavie_mat")) { pavie_seq2trained_pavie_mat ( D1->S, (n_actions==2)?action_list[1]:NULL); myexit (EXIT_SUCCESS); } else if ( strm (action, "pavie_seq2pavie_aln")) { pavie_seq2pavie_aln ( D1->S, action_list[1], ACTION(2)); myexit (EXIT_SUCCESS); } else if ( strm (action, "pavie_seq2pavie_dm")) { if (strstr (ACTION2(2,""), "_MSA_")) D1->S=aln2seq_main(D1->A, KEEP_GAP); pavie_seq2pavie_aln ( D1->S, action_list[1],(n_actions==3)?action_list[2]:"_MATDIST_"); myexit (EXIT_SUCCESS); } else if ( strm (action, "pavie_seq2pavie_msa")) { D1->A=pavie_seq2pavie_msa ( D1->S, action_list[1], (n_actions==3)?action_list[2]:NULL); } else if ( strm (action, "pavie_seq2pavie_tree")) { D1->T=pavie_seq2pavie_tree ( D1->S, action_list[1], (n_actions==3)?action_list[2]:NULL); } else if ( strm (action, "pavie_seq2pavie_sort")) { D1->A=pavie_seq2pavie_sort ( D1->S, action_list[1], (n_actions==3)?action_list[2]:NULL); } else if ( strm (action, "aln2mat_diaa")) { aln2mat_diaa (D1->S); } else if ( strm (action, "aln2mat")) { aln2mat(D1->S); } else if ( strm (action, "seq2latmat")) { seq2latmat ( D1->S, "stdout"); myexit (EXIT_SUCCESS); } else if ( strm (action , "rm_target_pdb")) { int i, j; char *buf; for (i=0; i< (D1->A)->nseq; i++) { j=1;buf=(D1->A)->name[i]; while (buf[j]!='_' && buf[j-1]!='_' && buf[j]!='\0')j++; buf[j]='\0'; } } else if ( strm ( action, "mat2cmp")) { double *r; r=mat2cmp (D1->M, D2->M); fprintf ( stdout, "\nMATRIX COMPARISON: R=%.3f R2=%.3f On %d pairs of values\n", (float)r[0], (float)r[1], (int)r[2]); myexit (EXIT_SUCCESS); } //Special modes else if ( strm ( action, "overaln_list")) { float *re, tre=0,sn, tsn=0, sp, tsp=0; int p1,p2,p3, t, f; FILE *fp; char fname [100]; Alignment **LA; Alignment **LB; HERE ("F P1 P2 P3 T"); t=ATOI_ACTION(1); f=ATOI_ACTION(2); p1=ATOI_ACTION(3); p2=ATOI_ACTION(4); p3=ATOI_ACTION(5); LA=vcalloc ((D1->A)->nseq, sizeof (Alignment*)); LB=vcalloc ((D2->A)->nseq, sizeof (Alignment*)); for (a=0; a<(D1->A)->nseq; a++) { LA[a]=main_read_aln ((D1->A)->name[a], NULL); LB[a]=main_read_aln ((D2->A)->name[a], NULL); } for ( a=0; a<(D1->A)->nseq; a++) { Alignment *A, *B; A=LA[a]; B=LB[a]; re=analyze_overaln (A, B, "_case_l_",t,f,p1,p2,p3); fprintf (stdout, "\n%d: sn: %.2f sp: %.2f re: %.2f F: %d P: %d P2: %d T: %d",a, re[0],re[1],re[2],f, p1,p2,t); tsn+=re[0]; tsp+=re[1]; tre+=re[2]; vfree(re); } fprintf (stdout, "\nTOT: sn: %.2f sp: %.2f re: %.2f F: %d P: %d P2: %d T: %d", tsn/(D1->A)->nseq,tsp/(D1->A)->nseq, tre/(D1->A)->nseq,f,p1,p2,t); exit (0); } else if ( strm ( action, "overaln_list_scan")) { float *re, tre=0, tsn=0, tsp; int p1,p2, p3, t, f; FILE *fp; char fname [100]; Alignment **LA; Alignment **LB; if ( ACTION(1))sprintf ( fname, "%s", ACTION(1)); else sprintf ( fname, "scan_results.txt"); fprintf ( stdout, "SCAN Results will be ouput in %s\n", fname); LA=vcalloc ((D1->A)->nseq, sizeof (Alignment*)); LB=vcalloc ((D2->A)->nseq, sizeof (Alignment*)); for (a=0; a<(D1->A)->nseq; a++) { LA[a]=main_read_aln ((D1->A)->name[a], NULL); LB[a]=main_read_aln ((D2->A)->name[a], NULL); } for (f=32; f<=40; f++) { for (p1=90; p1<=100; p1+=5) { for ( t=1; t<=3; t++) { for (p2=0; p2<=40; p2+=5) { for (p3=0;p3<=0;p3+=5) { tre=tsn=tsp=0; for ( a=0; a<(D1->A)->nseq; a++) { Alignment *A, *B; A=LA[a]; B=LB[a]; re=analyze_overaln (A, B, "_case_l_",t,f,p1,p2,p3); tsn+=re[0]; tsp+=re[1]; tre+=re[2]; vfree (re); } fp=vfopen (fname, "a"); fprintf (fp, "\nTOT: sn: %.2f sp: %.2f re: %.2f P: %d P2: %d P3: %d T: %d F: %d", tsn/(D1->A)->nseq,tsp/(D1->A)->nseq, tre/(D1->A)->nseq, p1,p2, p3,t,f); fprintf (stderr, "\nTOT: sn: %.2f sp: %.2f re: %.2f P: %d P2: %d P3: %d T: %d F: %d", tsn/(D1->A)->nseq,tsp/(D1->A)->nseq, tre/(D1->A)->nseq, p1,p2, p3,t,f); vfclose (fp); } } } } } exit (0); } else if ( strm ( action, "overaln"))//Evaluate the capacity to predict over-aligned regions { OveralnP *F; F=vcalloc (1, sizeof (OveralnP)); //al1: ref //al2: alignment //ATOI(1): P (0-100) //ATOI(2): T (0-9) float *r; DST=vcalloc (1,sizeof(Sequence_data_struc)); DST->A=aln2gap_cache (D1->A,0); lower_value=0; upper_value=0; D1->A=filter_aln_upper_lower (D1->A, DST->A, 0, 0); sprintf (F->mode, "%s", ((s=get_string_variable ("overaln_mode")))?s:"lower"); if (!strm (F->mode, "lower") && !strm (F->mode, "unalign"))printf_exit (EXIT_FAILURE,stderr,"\nERROR: unknown overal_mode in overal output [%s] [FATAL:%s]", F->mode, PROGRAM); if (int_variable_isset ("overaln_threshold"))F->t=get_int_variable ("overaln_threshold"); if (int_variable_isset ("overaln_target"))F->f=get_int_variable ("overaln_target"); if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P1"); if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P2"); if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P3"); if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P4");//F P1 P2 P3 T; D2->A=aln2clean_pw_aln (D2->A, F); r=aln2pred (D1->A, D2->A,"case_l_"); fprintf ( stdout, "sn %.2f sp %.2f re %.2f\n", r[0], r[1], r[2]); exit (0); } //JM_START else if ( strm ( action, "aln2hitMat")) { aln2hitMat(D1->A, ACTION(1)); myexit (EXIT_SUCCESS); } //JM_END else { fprintf ( stderr, "\nWARNING: ACTION %s UNKNOWN and IGNORED\n", action); } } void aln2mat_diaa (Sequence *S) { int a, aa1, aa2, aa3, aa4; int s1, s2, p; Alignment *A; int ****m; int **c; int naa=0; int count=0; double Delta=0.00001; int *alp; int tot,u; double observed, expected, f_diaa1, f_diaa2, v; alp=vcalloc (256, sizeof (int)); for (a=0; a<26; a++)alp[a+'a']=1; alp['b']=0; alp['j']=0; alp['o']=0; alp['u']=0; alp['x']=0; alp['z']=0; m=declare_arrayN (4,sizeof (int),26,26,26,26); c=declare_arrayN (2,sizeof (int),26,26); for ( a=0; a< S->nseq; a++) { fprintf ( stderr, "%s\n", S->name[a]); A=main_read_aln (S->name[a],NULL); for (s1=0; s1nseq; s1++)lower_string (A->seq_al[s1]); for ( s1=0; s1nseq-1; s1++) for (s2=s1+1; s2nseq; s2++) { for (p=0; plen_aln-1; p++) { u =alp[aa1=A->seq_al[s1][p]]; u+=alp[aa2=A->seq_al[s1][p+1]]; u+=alp[aa3=A->seq_al[s2][p]]; u+=alp[aa4=A->seq_al[s2][p+1]]; if ( u==4) { aa1-='a';aa2-='a';aa3-='a'; aa4-='a'; c[aa1][aa2]++; c[aa3][aa4]++; m[aa1][aa2][aa3][aa4]++; count+=2; } } } free_aln (A); } fprintf ( stdout, "# DIAA_MATRIX_FORMAT_01\n"); naa=26; for (aa1=0; aa10)fprintf ( stdout, "TEST C=%d expected=%.4f observed=%.4f v=%.4f [%d %d %d][%d] tot=%d\n", count, (float)expected, (float)observed, (float) v, c[aa1][aa2], c[aa3][aa4], count, m[aa1][aa2][aa3][aa4], tot); fprintf ( stdout, "%c%c %c%c %d %d\n", aa1+'a', aa2+'a', aa3+'a', aa4+'a', (int)v, m[aa1][aa2][aa3][aa4]+ m[aa3][aa4][aa1][aa2]); } } exit (EXIT_SUCCESS); } void aln2mat (Sequence *S) { int a, aa1, aa3; int s1, s2, p; Alignment *A; int **m; int *c; int naa=0; int count=0; double Delta=0.00001; int *alp; int tot,u; double observed, expected, f_diaa1, f_diaa2, v; alp=vcalloc (256, sizeof (int)); for (a=0; a<26; a++)alp[a+'a']=1; alp['b']=0; alp['j']=0; alp['o']=0; alp['u']=0; alp['x']=0; alp['z']=0; m=declare_arrayN (2,sizeof (int),26,26); c=declare_arrayN (1,sizeof (int),26); for ( a=0; a< S->nseq; a++) { fprintf ( stderr, "%s\n", S->name[a]); A=main_read_aln (S->name[a],NULL); for (s1=0; s1nseq; s1++)lower_string (A->seq_al[s1]); for ( s1=0; s1nseq-1; s1++) for (s2=s1+1; s2nseq; s2++) { for (p=0; plen_aln-1; p++) { u =alp[aa1=A->seq_al[s1][p]]; u+=alp[aa3=A->seq_al[s2][p]]; if ( u==2) { aa1-='a';aa3-='a'; c[aa1]++; c[aa3]++; m[aa1][aa3]++; count+=2; } } } free_aln (A); } fprintf ( stdout, "# MONOAA_MATRIX_FORMAT_01\n"); naa=26; for (aa1=0; aa10)fprintf ( stdout, "TEST C=%d expected=%.4f observed=%.4f v=%.4f [%d %d %d][%d] tot=%d\n", count, (float)expected, (float)observed, (float) v, c[aa1][aa2], c[aa3][aa4], count, m[aa1][aa2][aa3][aa4], tot); fprintf ( stdout, "%c %c %d %d\n", aa1+'a', aa3+'a', (int)v, m[aa1][aa3]+ m[aa3][aa1]); } } exit (EXIT_SUCCESS); } int **seq2latmat ( Sequence *S, char *fname) { int a, b, r0, r1; char *aa; int naa; int *count, tot; int **mat; double observed, expected; FILE *fp; fp=vfopen (fname, "w"); count=vcalloc ( 256, sizeof (int)); mat=declare_int (256, 256); naa=strlen ( BLAST_AA_ALPHABET); aa=vcalloc ( naa+2, sizeof (char)); sprintf ( aa, "%s", BLAST_AA_ALPHABET); lower_string (aa); for ( tot=0,a=0; a< S->nseq; a++) { ungap (S->seq[a]); for ( b=1; blen[a]; b++) { r0=tolower(S->seq[a][b-1]); r1=tolower(S->seq[a][b]); mat[r0][r1]++; //count[r0]++; count[r1]++; tot++; } } for ( a=0; a< naa; a++) for (b=0; b< naa; b++) { if ( aa[a]=='*' || aa[b]=='*'); else { expected=((double)count[(int)aa[a]]/(double)tot)* ((double)count[(int)aa[b]]/(double)tot)*(double)tot; observed=((double)mat[(int)aa[a]][(int)aa[b]]); /* fprintf ( stderr, "\n%c=%d %c=%d Tot=%d Obs=%d Exp=%d\n", aa[a],count[aa[a]], aa[b],count[aa[b]],tot, mat[aa[a]][aa[b]],(int)expected); fprintf ( stderr, "\n%d", mat[aa[a]][aa[b]]); fprintf ( stderr, "\n%d", mat[aa[a]][aa[b]]); */ mat[(int)aa[a]][(int)aa[b]]=(expected==0 || observed==0)?0:((int)10*log((observed/expected))); } } fprintf (fp,"# BLAST_MATRIX FORMAT\n#ALPHABET=%s\n#TRANSITION MATRIX TRAINED ON %d Sequence\n#", BLAST_AA_ALPHABET, S->nseq); for (a=0; a< naa; a++)fprintf ( fp, "%3c ", toupper(aa[a])); fprintf (fp,"\n"); for (a=0; a< naa; a++) { fprintf (fp, "%c", toupper(aa[a])); for ( b=0; b< naa; b++) { fprintf (fp, "%3d ", mat[(int)aa[a]][(int)aa[b]]); } fprintf ( fp, "\n"); } vfclose (fp); vfree (count); vfree (aa); return mat; } double* mat2cmp ( int **mat1, int **mat2) { int a, b, n, x, y; double **list, *r; if ( !mat1 || !mat2) { fprintf ( stderr, "\nERROR: mat2cmp needs two matrices [FATAL:%s]", PROGRAM); myexit (EXIT_FAILURE); } for (n=0, a=0; a< 256; a++) for ( b=0; b<256; b++) { x=mat1[a][b]; y=mat2[a][b]; if ( x|| y)n++; } if ( n==0) return 0; list=declare_double (n, 2); for (n=0, a=0; a<256; a++) for ( b=0; b<256; b++) { x=mat1[a][b]; y=mat2[a][b]; if ( x || y) { list[n][0]=x; list[n][1]=y; n++; } } r=return_r (list, n); free_double(list, -1); return r; } int ** read_blast_matrix ( char *mat_name) { FILE *fp; int n_aa,aa1, aa2; int a, b, c; int **matrix; int value; char sbuf[VERY_LONG_STRING]; char buf[2]; char alp[257]; matrix=declare_int (256,256); vfree ( matrix[30]); matrix[30]=vcalloc(10000, sizeof (int)); fp=vfopen ( mat_name, "r"); while ( (c=fgetc(fp))=='#' || isspace(c) ) { char *p; fgets ( sbuf, VERY_LONG_STRING, fp); if ( (p=strstr (sbuf, "ALPHABET"))) sscanf (p, "ALPHABET=%s", alp); } ungetc(c, fp); lower_string (alp); n_aa=strlen (alp); for ( a=0; a< n_aa; a++) { fscanf ( fp, "%s ", buf); aa1=tolower(buf[0]); if ( aa1!=alp[a]) { fprintf ( stderr, "\nParsing_error when reading blast_matrix %s:\n%c %c",mat_name, aa1,alp[a]); fprintf ( stderr, "\n%c ", fgetc(fp)); myexit (EXIT_FAILURE); } for ( b=0; bnseq; a++) { if (S->seq_comment[a] && (s=strstr(S->seq_comment[a], "_FIRSTYEAR"))) { sscanf (s, "_FIRSTYEAR%d_", &first); } else first=1; for ( y=first,b=0; blen[a]; b++) { if ( !is_gap(S->seq[a][b])) { S->seq[a][b]='a'+((y/modulo))%10; y++; } } if ( (s=strstr ( S->name[a], "_agechannel"))) { sprintf ( s, "%s", new_channel); } else strcat (S->name[a], new_channel); } return S; } Sequence* output_n_pavie_age_channel (Sequence *S, char *name, int n) { int x, a; if (!n)n=2; for ( x=1,a=0; a< n; a++, x*=10) { S=output_pavie_age_channel(S, name,x); } return S; } Sequence* output_pavie_age_channel (Sequence *S, char *name, int modulo) { Alignment *A; FILE *fp; static int display; char mat_list_name[100]; char seq_list[1000]; char mat_name[1000]; char *tmp; sprintf ( mat_list_name, "%s_pavie_age_matrix.mat_list", name); sprintf (seq_list, "%s_age_channel.fasta",name); if ( display==0 ) { if (check_file_exists(seq_list))vremove (seq_list); if (check_file_exists(mat_list_name))vremove (mat_list_name); } sprintf (mat_name, "%s_age_mat_mod%d.mat",name, modulo); output_age_matrix ( mat_name, modulo); fp=vfopen ( mat_list_name,"a"); fprintf ( fp, "%s\n", mat_name); vfclose ( fp); S=seq2year (S,modulo); A=seq2aln (S, NULL, KEEP_GAP); output_fasta_seq (tmp=vtmpnam (NULL),A); file_cat ( tmp, seq_list); if ( display==0) { display_output_filename ( stdout, "AGE_MAT_LIST", "MAT_LIST", mat_list_name, CHECK); display_output_filename ( stdout, "AGE_SEQ", "FASTA", seq_list, CHECK); display=1; } fprintf ( stderr, "\nModulo:%d years", modulo); fprintf ( stderr, "\n"); free_aln (A); return S; } // // Name MAnipulation // Alignment *clean_aln (Alignment *A) { if ( A) { A->seq_comment=clean_string (A->nseq, A->seq_comment); A->aln_comment=clean_string (A->nseq, A->aln_comment); A->name=translate_names(A->nseq, A->name); (A->S)=clean_sequence ((A->S)); } return A; } Sequence *clean_sequence ( Sequence *S) { if ( !S) return S; S->seq_comment=clean_string (S->nseq, S->seq_comment); S->name=translate_names(S->nseq, S->name); return S; } char ** translate_names (int n, char **name) { int a; for ( a=0; a<", name[a]))name[a]='_'; } sprintf (buf,"%s",decode_name (name, DECODE)); if ( strlen (buf)>read_array_size_new ((char *)name)) { name=vrealloc (name, sizeof (char)*(strlen (buf)+1)); } sprintf (name, "%s", buf); return name; } char *decode_name (char *name, int mode) { static char ***name_list; static int n; static char tag[100]; int a; if (mode==CLEAN) { for (a=0; a %s\n", name_list[a][0], name_list[a][1]); return file; } if (mode ==DECODE && name_list==NULL)return name; if ( name==NULL) return name; if (!tag[0]) { vsrand (0); sprintf ( tag, "TCTAG_%d",rand ()%100000); } if ( mode==CODE) { for (a=0; a< n; a++) if ( strm (name, name_list[a][0]))return name_list[a][1]; name_list=realloc (name_list, sizeof (char**)*(n+1)); name_list[n]=vcalloc (2, sizeof (char*)); name_list[n][0]=vcalloc (strlen (name)+1, sizeof (char)); name_list[n][1]=vcalloc (100, sizeof (char)); sprintf ( name_list[n][0], "%s", name); sprintf ( name_list[n][1], "%s_%d", tag,n+1); return name_list[n++][1]; } else if ( mode ==DECODE) { char *p; int i; if ( !(p=after_strstr (name, tag)))return name; else { sscanf (p, "_%d", &i); return name_list[i-1][0]; } } else { printf_exit (EXIT_FAILURE, stderr,"Unknown Mode for Decode_name [FATAL:%s]", PROGRAM); } return NULL; } FILE * display_sequences_names (Sequence *S, FILE *fp, int check_pdb_status, int print_templates) { int a; int max_len; char *r; if ( !S) { fprintf (fp,"\nERROR: NO SEQUENCE READ [FATAL:%s]\n", PROGRAM); myexit (EXIT_FAILURE); } for ( a=0, max_len=0; a< S->nseq; a++)max_len=MAX(max_len, strlen (S->name[a])); fprintf ( fp, "\nINPUT SEQUENCES: %d SEQUENCES [%s]", S->nseq,(S->type)?S->type:"Unknown type"); for ( a=0; a< S->nseq; a++) { fprintf (fp, "\n Input File %-*s Seq %-*s Length %4d type %s",max_len,S->file[a], max_len,S->name[a],(int)strlen ( S->seq[a]), S->type); if (check_pdb_status) { if ((r=seq_is_pdb_struc (S, a)))fprintf (fp, " Struct Yes PDBID %s", get_pdb_id(r)); else fprintf (fp, " Struct No"); /* if (is_pdb_struc (S->name[a])||is_pdb_struc (S->file[a]) )fprintf (fp, " Struct Yes"); else fprintf (fp, " Struct No"); */ } else fprintf (fp, " Struct Unchecked"); if ( print_templates)fp=display_sequence_templates (S, a, fp); } fprintf ( fp, "\n"); return fp; } Sequence *add_file2file_list (char *name, Sequence *S) { if (!S) S=declare_sequence (1,1,10); else S=realloc_sequence (S,S->nseq+1,0);S->nseq=0; sprintf ( S->name[S->nseq++], "%s", name); return S; } /*********************************COPYRIGHT NOTICE**********************************/ /*© Centro de Regulacio Genomica */ /*and */ /*Cedric Notredame */ /*Tue Oct 27 10:12:26 WEST 2009. */ /*All rights reserved.*/ /*This file is part of T-COFFEE.*/ /**/ /* T-COFFEE is free software; you can redistribute it and/or modify*/ /* it under the terms of the GNU General Public License as published by*/ /* the Free Software Foundation; either version 2 of the License, or*/ /* (at your option) any later version.*/ /**/ /* T-COFFEE is distributed in the hope that it will be useful,*/ /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/ /* GNU General Public License for more details.*/ /**/ /* You should have received a copy of the GNU General Public License*/ /* along with Foobar; if not, write to the Free Software*/ /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/ /*............................................... |*/ /* If you need some more information*/ /* cedric.notredame@europe.com*/ /*............................................... |*/ /**/ /**/ /* */ /*********************************COPYRIGHT NOTICE**********************************/