8 #include "io_lib_header.h"
9 #include "util_lib_header.h"
10 #include "dp_lib_header.h"
11 #include "define_header.h"
12 #include "dev1_lib_header.h" //JM_STRAT
/*
 * Bounds-checked accessors for `action_list`.
 *
 * These macros expand in place and reference two names that must be in scope
 * wherever they are used:
 *   n_actions   - number of valid entries in action_list
 *   action_list - array of strings, indexed from 0
 *
 * ACTION(x)      -> action_list[x] if entry x exists, NULL otherwise.
 * ACTION2(x,y)   -> action_list[x] if entry x exists, y otherwise.
 * ATOI_ACTION(x) -> atoi() of entry x if it exists, 0 otherwise.
 *
 * Every argument is parenthesised so that an expression argument
 * (e.g. `flag ? 0 : 1`) is bounds-checked and indexed as one single value.
 * NOTE: x is expanded more than once; never pass an expression with side
 * effects (such as `i++`).
 */
#define ACTION(x) ((n_actions>=((x)+1))?action_list[(x)]:NULL)
#define ACTION2(x,y) ((n_actions>=((x)+1))?action_list[(x)]:(y))
#define ATOI_ACTION(x) ((ACTION(x)!=NULL)?(atoi(ACTION(x))):0)
18 /**************************************************************************************************/
19 /***************************** SEQ_REFORMAT ******************************************/
20 /**************************************************************************************************/
/*
 * Forward declarations, so these routines can be referenced before they are
 * defined (definitions presumably appear later in this file -- TODO confirm).
 * The ones marked `static` have internal linkage, i.e. they are private to
 * this translation unit; the others are visible to other translation units.
 */
21 int output_transitions(char *outfile, Alignment *A);
22 static int output_age_matrix ( char *outfile, int val);
23 int SeqGCGCheckSum(char *seq, int len);
24 static Sequence *seq2year ( Sequence *S, int modulo);
25 static Sequence* output_n_pavie_age_channel (Sequence *S, char *name, int n);
26 static Sequence* output_pavie_age_channel (Sequence *S, char *name, int modulo);
28 static int output_seq2struc(char *outfile, Alignment *A);
29 void output_conservation_statistics ( char *file, Alignment *A);
30 /**************************************************************************************************/
31 /***************************** SEQ_REFORMAT ******************************************/
32 /**************************************************************************************************/
33 int seq_reformat ( int argc, char **in_argv)
36 Sequence_data_struc *D1=NULL;
37 Sequence_data_struc *D2=NULL;
38 Sequence_data_struc *D_ST=NULL;
39 Action_data_struc *RAD;
52 char *struc_in_format;
53 char *struc_out_format;
60 char ***rename_list=NULL;
68 RAD=vcalloc ( 1, sizeof ( Action_data_struc));
70 declare_name (cache);sprintf ( cache, "use");
71 declare_name(in_file);
72 declare_name(in2_file);
73 declare_name(out_file);
74 declare_name(out2_file);
75 declare_name(struc_in_format);
76 declare_name(struc_out_format);
77 declare_name(RAD->coor_file);
79 declare_name(struc_in_file);
80 declare_name(struc_out_file);
81 declare_name(in_format);
82 declare_name(in2_format);
83 declare_name(out_format);
84 declare_name(rename_file);
87 argv=break_list ( in_argv, &argc, "=;, \n");
89 action_list=declare_char ( 100, 100);
91 /*END INITIALIZATION*/
93 addrandinit ( (unsigned long) 500);
95 if ( argc==1 || strm6 ( argv[1], "h", "-h", "help", "-help", "-man", "?"))
98 fprintf ( stdout, "\n%s (%s,%s,%s [%s])\n",PROGRAM, VERSION,AUTHOR, DATE, URL);
99 fprintf ( stdout, "\n*********** MINIMUM SYNTAX *****************");
100 fprintf ( stdout, "\nseq_reformat -in <in_file> -output <out_format>");
101 fprintf ( stdout, "\nSome File formats are automatically recognised");
102 fprintf ( stdout, "\nSee Format section");
103 fprintf ( stdout, "\n");
104 fprintf ( stdout, "\n*********** MAIN FLAGS ******************");
105 fprintf ( stdout, "\n-in name........Name of the file read");
108 fprintf ( stdout, "\n-input format......Name of the format read, see Input Format Section");
109 fprintf ( stdout, "\n...................Automatic detection, except for seqs of numbers");
110 fprintf ( stdout, "\n...................-input number_aln | number_fasta");
111 fprintf ( stdout, "\n-in2 fname......Second alignment");
112 fprintf ( stdout, "\n-input2 format.....See -input");
113 fprintf ( stdout, "\n-exon_boundaries obj file");
114 fprintf ( stdout, "\n-out fname......Output file (defualt is STDOUT");
115 fprintf ( stdout, "\n-output format.....Output Format, default is fasta_aln");
116 fprintf ( stdout, "\n-struc_in name...File containing a coded aln");
117 fprintf ( stdout, "\n-struc_in_f format.See -input and output format section");
118 fprintf ( stdout, "\n-struc_out fname..Name of the output structure");
119 fprintf ( stdout, "\n-struc_out_f symbol");
120 fprintf ( stdout, "\n-keep_case=on|off..keep case, On by default");
121 fprintf ( stdout, "\n-action +ac1 +ac2..See the action section");
122 fprintf ( stdout, "\n-rename <file>.....Rename the sequences following <file> indications");
123 fprintf ( stdout, "\n...................File Format: One couple <name1><space><name2>/line");
124 fprintf ( stdout, "\n...................Rename order <name1> into <name2>");
125 fprintf ( stdout, "\n...................code file: -output code_name");
126 fprintf ( stdout, "\n-code <file> Rename file <name1> to <name2>");
127 fprintf ( stdout, "\n-decode <file> Rename file <name2> to <name1>");
128 fprintf ( stdout, "\n-no_warning........Suppresses all warnings");
129 fprintf ( stdout, "\n-cache.............use,ignore,update,local, DirectoryName");
132 fprintf ( stdout, "\n");
134 fprintf ( stdout, "\n*********** REFORMAT ACTIONS *****************");
135 fprintf ( stdout, "\n +Xaction.............Specifies which file undergoes the action");
136 fprintf ( stdout, "\n +Xaction.............X=1: -in");
137 fprintf ( stdout, "\n +Xaction.............X=2: -in2");
138 fprintf ( stdout, "\n +Xaction.............X=3: -struc_in");
139 fprintf ( stdout, "\n +name2unique_name....replace duplicated name with name_#");
140 fprintf ( stdout, "\n +swap_header........,swapp comments: replace comments/name in 1 by in 2");
141 fprintf ( stdout, "\n +swap_lib_header.F...Replace the sequences in the tc_lib (-in) with those in F");
142 fprintf ( stdout, "\n .....................F is a legal FASTA file");
145 fprintf ( stdout, "\n +translate[0-2]......Translate on Frame 0, 1, 2 ");
146 fprintf ( stdout, "\n +translate[3]........longuest ORF on direct strand");
147 fprintf ( stdout, "\n +translate[4]........longuest ORF on direct+complementary strand");
150 fprintf ( stdout, "\n +add_scale..<offset>.addscale below aln");
152 fprintf ( stdout, "\n +rm_gap n ...........Removes col with n%% gap [n=100]");
153 fprintf ( stdout, "\n +rmgap_col SEQ1:SEQ2.Removes column with a gap in SEQ [#] ");
155 fprintf ( stdout, "\n +backtranslate.......Random Backtranslation");
156 fprintf ( stdout, "\n +complement..........Produces the reverse complement");
158 fprintf ( stdout, "\n +reorder.............Reorders sequences of <in> according to <in2>");
159 fprintf ( stdout, "\n .........random......Random_order");
160 fprintf ( stdout, "\n .........tree........Tree Order (in2)");
161 fprintf ( stdout, "\n +reorder_columns.....Reorders sequences of <in> according to <in2>");
162 fprintf ( stdout, "\n .........random......Random_order");
163 fprintf ( stdout, "\n .........tree..mode..Tree Order (comuted with mode: sarmat, idmat, blosum62mt...");
164 fprintf ( stdout, "\n +aln2random_aln SCR..Randomize the aln, S: swap sequences names");
165 fprintf ( stdout, "\n .....................Swap residues within colums");
166 fprintf ( stdout, "\n .....................Swap residues across the aln");
167 fprintf ( stdout, "\n +aln2sample......N......");
168 fprintf ( stdout, "\n +aln2bootstrap...N......");
171 fprintf ( stdout, "\n +chain...............Identifies all the intermediate sequences from <-in>");
172 fprintf ( stdout, "\n .....................needed to join every sequence pair in <-in2>");
174 fprintf ( stdout, "\n +aln2cons mat_name..Ouputs a consensus sequence");
175 fprintf ( stdout, "\n .....................The consensus is determined using mat");
176 fprintf ( stdout, "\n .....................By Default, mat=blosum62mt, name=Cons");
177 fprintf ( stdout, "\n +aln2resindex........Prints the sequence index of each residue in -in for each -in2 sequence");
178 fprintf ( stdout, "\n +collapse_aln <new name> <seq1> <seq2...> | file name");
179 fprintf ( stdout, "\n .....................Replaces a group of sequences with its consensus");
180 fprintf ( stdout, "\n .....................The replacement sequence is named <new_seq>");
181 fprintf ( stdout, "\n .....................List of sequences can be provided via a file");
182 fprintf ( stdout, "\n .....................File:>new_name seq1 seq2 seq3....");
183 fprintf ( stdout, "\n +original_seqnos.....Keep original seqnos [SWITCH]");
184 fprintf ( stdout, "\n +seqnos..............Print Seqnos [SWITCH]");
185 fprintf ( stdout, "\n +code_dna_aln........Undocumented") ;
186 fprintf ( stdout, "\n +grep..[NAME|SEQ|COMMENT]..[KEEP|REMOVE]..[string]......");
187 fprintf ( stdout, "\n .....................Keeps or Removes Sequences matching string");
188 fprintf ( stdout, "\n +extract_block <seq> <start> <end> | <seq> <pos> |<filename>");
189 fprintf ( stdout, "\n .....................Extract column pos OR [start to end[");
190 fprintf ( stdout, "\n .....................<filename> Format");
191 fprintf ( stdout, "\n .......................seq start end | seq pos");
192 fprintf ( stdout, "\n .......................# for comments");
193 fprintf ( stdout, "\n .......................! seq offset_value (0 by default)");
194 fprintf ( stdout, "\n .....................Can extract as many positions as needed");
195 fprintf ( stdout, "\n .....................seq=cons: measure positions on the full aln");
196 fprintf ( stdout, "\n +cat_aln.............Concatenates the alignments input via -in and -in2");
197 fprintf ( stdout, "\n +cat_aln.............-if no -in2, -in is expected to be a list of alignments to concatenate");
199 fprintf ( stdout, "\n +msalist2cat_pwaln.min..max");
200 fprintf ( stdout, "\n .....................extract all pw projections and conctaenates those\n");
201 fprintf ( stdout, "\n .....................where id>=min and id<=max\n");
202 fprintf ( stdout, "\n .....................min and max can be omitted (min=0, max=100)\n");
204 fprintf ( stdout, "\n +seq2blast <matrix>..gather all possible homologues from NR (EBI BLAST)");
205 fprintf ( stdout, "\n +seq2msa <matrix>....makes a standard progressive alignment using matrix");
206 fprintf ( stdout, "\n +realign_block <c1> <c2> <pg>");
207 fprintf ( stdout, "\n .....................Realign column c1 to c2 (non inc.) with pg)");
208 fprintf ( stdout, "\n .....................pg reads fasta and outputs fasta");
209 fprintf ( stdout, "\n .....................pg -infile=<infile> -outfile=<outfile>");
210 fprintf ( stdout, "\n +extract_seq seq_name (start end seq_name start end...) | filename");
211 fprintf ( stdout, "\n .....................seq_name='*': every seq");
212 fprintf ( stdout, "\n .....................start='*' : real start");
213 fprintf ( stdout, "\n .....................end='*' : real end");
214 fprintf ( stdout, "\n .....................filename: fasta format");
215 fprintf ( stdout, "\n +extract_seq_list name1 name2");
216 fprintf ( stdout, "\n .....................Extracts entire sequences");
217 fprintf ( stdout, "\n +remove_seq sn1 sn2..Removes sequences sn1, sn2...");
218 fprintf ( stdout, "\n +remove_seq empty....Removes empty sequences (gap only)");
219 fprintf ( stdout, "\n +remove_seq unique...Remove all multiple occurences except the first");
220 fprintf ( stdout, "\n +thread_profile_on_msa <file>");
221 fprintf ( stdout, "\n .....................Threads a list of profiles on corresponding seq");
222 fprintf ( stdout, "\n .....................File: >seqname _R_ <msa file> [nlines]");
224 fprintf ( stdout, "\n +thread_dna_on_prot_aln");
225 fprintf ( stdout, "\n .....................-in DNA.seq and -in2 AA.aln");
226 fprintf ( stdout, "\n +thread_struc_on_aln");
227 fprintf ( stdout, "\n .....................-in structure and -in2 aln");
228 fprintf ( stdout, "\n +use_cons............Use the consensus for n[SWITCH]");
229 fprintf ( stdout, "\n +upper.n|[n1-n2].....n omitted sets everything to upper case");
230 fprintf ( stdout, "\n .....................To use n: provide a number_aln via:");
231 fprintf ( stdout, "\n .....................-struc_in <number_file> -struc_in_f number_aln");
232 fprintf ( stdout, "\n .....................if use_cons is set n, is read on the cons");
233 fprintf ( stdout, "\n .....................n: will upper every residue with a value of n in struc_in");
234 fprintf ( stdout, "\n .....................[n1-n2]: upper residues between n1 and n2");
235 fprintf ( stdout, "\n +lower n|[n1-n2]....See +upper");
236 fprintf ( stdout, "\n +switchcase n|[n1-n2]See +upper");
237 fprintf ( stdout, "\n +color_residue <seq> <pos> <color> | file");
238 fprintf ( stdout, "\n .....................File: seq_name pos color");
239 fprintf ( stdout, "\n .....................color: 0-9");
240 fprintf ( stdout, "\n +edit_residue <seq> <pos> <edit> | file");
241 fprintf ( stdout, "\n .....................File: seq_name pos color");
242 fprintf ( stdout, "\n .....................edit: upper|lower|symbol");
246 fprintf ( stdout, "\n +keep n|[n1-n2]....Only keep residues that have a score between n1 and n2");
248 fprintf ( stdout, "\n +invert..............Inverts the sequences: CAT => TAC");
249 fprintf ( stdout, "\n +rotate name Rotate an MSA, names each sequence name_col#");
250 fprintf ( stdout, "\n +convert n|[n1-n2] s1 s2 ....");
251 fprintf ( stdout, "\n +merge_annotation.... ");
253 fprintf ( stdout, "\n .....................Converts residues with your alignment");
254 fprintf ( stdout, "\n .....................similar to upper");
255 fprintf ( stdout, "\n .....................s1: ABCDe turns every ABCD into e");
256 fprintf ( stdout, "\n .....................s1: #e turns any residue into e");
257 fprintf ( stdout, "\n aln2short_aln L C S..Turns sequences into shorter sequences");
258 fprintf ( stdout, "\n .....................L: list of residues to keep");
259 fprintf ( stdout, "\n .....................S: Size of Streches replaced by symbol C");
262 fprintf ( stdout, "\n +random n l..........Generates N random sequences of len l");
263 fprintf ( stdout, "\n .....................You must provide a file with -in");
264 fprintf ( stdout, "\n +count n|[n1-n2] s1 s2....");
265 fprintf ( stdout, "\n .....................Counts residues with your alignment");
266 fprintf ( stdout, "\n .....................similar to convert");
267 fprintf ( stdout, "\n +print_format........prints the format name");
268 fprintf ( stdout, "\n +keep_name...........Keep the original sequence name on extraction");
270 fprintf ( stdout, "\n +remove_aa pos Ml Ncycle Random_len");
271 fprintf ( stdout, "\n .....................Randomly modifies an alignment");
272 fprintf ( stdout, "\n .....................pos=0: chosen randomly");
273 fprintf ( stdout, "\n .....................MaxLen of the deletions, Ncycle: number of cycles");
274 fprintf ( stdout, "\n .....................Random_len: 0 sets the len to maxlen, 1 to a random value");
275 fprintf ( stdout, "\n +remove_nuc.x........Remove Position 1, 2 or 3 of every codon");
276 fprintf ( stdout, "\n +evaluate matrix..gop..gep");
277 fprintf ( stdout, "\n .....................Make a similarity evaluation with matrix");
278 fprintf ( stdout, "\n .....................use -output=score_ascii, or score_html.");
279 fprintf ( stdout, "\n .....................You can filter on the values");
280 fprintf ( stdout, "\n +evaluate matrix..gop..gep");
281 fprintf ( stdout, "\n .....................Make an SP evaluation with matrix");
282 fprintf ( stdout, "\n .....................Uses Natural Gap penalties");
283 fprintf ( stdout, "\n .....................gop and gep must be negative");
284 fprintf ( stdout, "\n .....................use -output=color_ascii, color_html to get a color display");
286 fprintf ( stdout, "\n.....+evaluate_lat........Make a lateral evaluation with matrix");
287 fprintf ( stdout, "\n +msa_weight proc.....Computes weights using the procedure");
288 fprintf ( stdout, "\nRNA analysis Post Processing___________________________________________________");
289 fprintf ( stdout, "\n +aln2alifold.........Turns the MSA into a consensus structure");
290 fprintf ( stdout, "\n +add_alifold.........adds an alifold consensus structure");
292 fprintf ( stdout, "\n +alifold2analyze.mode..mode=stat_cache_list_aln_color_html_ps_usegap");
293 fprintf ( stdout, "\n .......................stat: compile Number of compensated mutations");
294 fprintf ( stdout, "\n .......................cache: ascii-code compensated mutations on aln");
295 fprintf ( stdout, "\n .......................html: color-code compensated mutations on aln");
296 fprintf ( stdout, "\n .......................aln: mark compensated mutations on stockholm aln");
297 fprintf ( stdout, "\n .......................usegap: do not ignore positions with gaps");
299 fprintf ( stdout, "\n +cmp_RNAfold.........compares the sec struc of in1 and in2 (computes them with alifold if missing)");
301 fprintf ( stdout, "\nMSA Post Processing___________________________________________________");
302 fprintf ( stdout, "\n +force_aln filename|seq1 res1 seq2 res2");
303 fprintf ( stdout, "\n .....................Forces residue 1 of seq1 to be aligned with res2 of seq 2");
304 fprintf ( stdout, "\n .....................In a file, there must be one pair of interaction/line");
305 fprintf ( stdout, "\n +sim_filter[_aln_Ix_iy_Cz_cw <seq>");
306 fprintf ( stdout, "\n ....................._<unaln or aln>, aln is assumed");
307 fprintf ( stdout, "\n ....................._I max identity to seq");
308 fprintf ( stdout, "\n ....................._i min identity to seq");
309 fprintf ( stdout, "\n ....................._C max cov on seq");
310 fprintf ( stdout, "\n ....................._c min cov on seq");
311 fprintf ( stdout, "\n +trim[_aln_%%%%50_n111_N50_T_Fn_fS_pS_max_sim] [string2]");
312 fprintf ( stdout, "\n ....................._<seq or aln>, aln is assumed");
313 fprintf ( stdout, "\n ....................._%%%%<max/min_percent_similarity>");
314 fprintf ( stdout, "\n ....................._max Or _min <keep sequences for which sim is the max or the min [Def: _max>");
315 fprintf ( stdout, "\n ....................._cov Or _sim Filter according to the coverage [Def: _sim]");
316 fprintf ( stdout, "\n ....................._n<max_number_of_sequence> ");
317 fprintf ( stdout, "\n ....................._N<percent_of_sequences_to_keep>");
318 fprintf ( stdout, "\n ....................._T Reorder the sequences according to a tree BEFORE triming");
319 fprintf ( stdout, "\n ....................._Fn Keep only sequences that have AT LEAST ONE residue aligned");
320 fprintf ( stdout, "\n ......................in the n first and n last columns. ");
321 fprintf ( stdout, "\n ....................._O<min sim> Remove outlayers that have less than min average sim with other sequences");
323 fprintf ( stdout, "\n .....................Keeping Sequences: Sequences provided via -in2 will be kept");
325 fprintf ( stdout, "\n .....................Keeping Sequences: Sequences whose name contains <string> in field fS will be kept");
326 fprintf ( stdout, "\n ....................._f<NAME|SEQ|COMMENT> designates a field");
327 fprintf ( stdout, "\n .....................<string> is a Perl regular expression");
328 fprintf ( stdout, "\n +aln2unalign Mode Penalty Threshold");
329 fprintf ( stdout, "\n .....................Identifies all the streches less conserved than than the average");
330 fprintf ( stdout, "\n .....................Mode: lower|number|unalign Act on all the resiues withs score<Thres");
331 fprintf ( stdout, "\n .....................Penalty: FSA penalty align2unalign, Def=90");
332 fprintf ( stdout, "\n .....................Threshold: Fraction of unaligned residues(0-9) Def=2");
334 fprintf ( stdout, "\n +clean_cdna..........Undocumented");
335 fprintf ( stdout, "\n +clean_maln..........Undocumented");
336 fprintf ( stdout, "\nTree Analysis___________________________________________________");
339 fprintf ( stdout, "\n +tree_prune..........Prune the tree -in using the sequences provided via -in2");
340 fprintf ( stdout, "\n +tree_cmp............Compares the tree -in and the tree -in2");
341 fprintf ( stdout, "\n .....................-in and -in2 can contain different taxons");
342 fprintf ( stdout, "\n +tree_scan.P1..P2.....scans alignment <-in> with tree <-in2>)");
343 fprintf ( stdout, "\n ......................+tree_scan help to get P1 information");
344 fprintf ( stdout, "\n ......................+aln2tree help to get P2 information");
346 fprintf ( stdout, "\n .....................-in and -in2 can contain different taxons");
347 fprintf ( stdout, "\n +treelist2groups.N....count all topologies within a list of trees");
348 fprintf ( stdout, "\n .....................-in is in fasta format with each name being a newick file");
349 fprintf ( stdout, "\n .....................-in2 can be a list of sequences used to trim the trees");
350 fprintf ( stdout, "\n ......................N can be used to unresolve the trees with Depth N");
351 fprintf ( stdout, "\n +treelist2lti.N.C.....Reports the average stability of each sequence neighborhood");
352 fprintf ( stdout, "\n ......................Species can be selected via -in2 [Fasta file with Taxon names]");
353 fprintf ( stdout, "\n ......................OR the sequences observed in C% of the files are kept [Def: C=100]");
356 fprintf ( stdout, "\n +treelist2seq.C.......Reports the species observed in C% of the trees");
357 fprintf ( stdout, "\n +treelist2splits......List and counts all the splits in a list of trees");
358 fprintf ( stdout, "\n ......................splits can be restricted to a list of sequences provided via -in2");
359 fprintf ( stdout, "\n +treelist2dmat.......outputs a distance matrix for a list of trees");
361 fprintf ( stdout, "\n +tree_compute n s....Computes a tree using the MSA provided with -in");
362 fprintf ( stdout, "\n ....................n:0-9, controls the way the MSA is filtered");
363 fprintf ( stdout, "\n ....................s:pam250mt|blosum62mt|categories|enthropy");
364 fprintf ( stdout, "\n ....................s:controls the column evaluation in MSA");
365 fprintf ( stdout, "\n +change_distances.f.f:float, sets all the distances to f in the tree");
366 fprintf ( stdout, "\n +change_bootstrap n..:n=0 removes all the bootstrap values");
367 fprintf ( stdout, "\n .....................:n!=0 adds a the value n to every node");
368 fprintf ( stdout, "\n +tree2dpatree........Replaces tree distances with the minimum %%ID in");
369 fprintf ( stdout, "\n .....................the depending subgroup. The ID is measured on an");
370 fprintf ( stdout, "\n .....................-in=TREE -in2=ALN");
371 fprintf ( stdout, "\n +unroot..............Removes the root in the input tree");
372 fprintf ( stdout, "\n +tree2group.N.I.P....Reports all the tree subgroup with at most Nseq");
373 fprintf ( stdout, "\n .....................and at min I%% identity. Output format can be read by");
374 fprintf ( stdout, "\n .....................collapse_tree. New groups are named P_1, P_2...");
375 fprintf ( stdout, "\n +collapse_tree.F.....Collapses trees. F is either a file or a list");
376 fprintf ( stdout, "\n .....................<new name> <seq1> <seq2>...");
377 fprintf ( stdout, "\n +aln2tree............Computes a tree");
378 fprintf ( stdout, "\n ..ktupN|aln|sarmat ktupN: match size N to estimate distances");
379 fprintf ( stdout, "\n .....................aln: Measures distances on aln");
380 fprintf ( stdout, "\n .....................sarmat: expects in to be a SAR matrix of O and I");
381 fprintf ( stdout, "\n ..nj | cw............Runs Neighbor Joining OR Cw to compute Tree");
382 fprintf ( stdout, "\n ..dpa................Turns the tree into a daptree (+tree2dpatree)");
383 fprintf ( stdout, "\n +node_sort..<name>...Sort leafs of tree n1, by node distance");
386 fprintf ( stdout, "\nMatrix Analysis___________________________________________________");
387 fprintf ( stdout, "\n +aln2mat_diaa........computes a dinucleotide matrix on a list of aln");
388 fprintf ( stdout, "\n +aln2mat.............computes a log odd matrix");
390 fprintf ( stdout, "\n +seq2lat_mat.........computes a transition matrix on seq provided via -in");
392 fprintf ( stdout, "\nStructure Analysis___________________________________________________");
393 fprintf ( stdout, "\n +struc2contacts.A.B D.Displays in capitals all the residues of A");
394 fprintf ( stdout, "\n ......................Less than D Angs from a residue of B");
395 fprintf ( stdout, "\n ......................A and B are pdb file, D is a distance in Angs");
396 fprintf ( stdout, "\n +seq2contacts.A.D.....Identifies all the residues in contact with ligands");
397 fprintf ( stdout, "\n ......................Ligands are in the FASTA header of struc in");
398 fprintf ( stdout, "\n ......................>Name _S_ [Target Struc] [Ligand1] [Chain] ...");
399 fprintf ( stdout, "\n ......................Output: number_fasta: 0=no contact, 1=ligand 1...");
400 fprintf ( stdout, "\n ......................9: residues in contact with more than 1 ligand");
401 fprintf ( stdout, "\n ......................Use -output=color_html/ascii to display result");
402 fprintf ( stdout, "\n +struc2nb...D.........Display a list of all the residues D appart");
403 fprintf ( stdout, "\n +rm_template...V......Removes _[S|G|R]_[template] to sequence names");
404 fprintf ( stdout, "\n ......................V: omitted | sequences <=> Output sequences");
405 fprintf ( stdout, "\n ......................V: template <=> Output templates");
407 fprintf ( stdout, "\n +add_template.F.......Add _[S|G|R]_[template] to sequence names");
408 fprintf ( stdout, "\n ......................F can either be a fasta file or an executable");
409 fprintf ( stdout, "\n ......................F: File: >name _S_ template");
410 fprintf ( stdout, "\n ......................F: executable: pg -infile=<seq> -outfile=<tagged>");
411 fprintf ( stdout, "\nMatrix Comparison___________________________________________________");
412 fprintf ( stdout, "\n +mat2cmp...............Returns the correlation coefficient between two matrices");
413 fprintf ( stdout, "\n .......................-in mat1 -input matrix, -in2 mat2 -input2 matrix");
414 fprintf ( stdout, "\n*********** INPUT FORMATS: Alignments *****************");
415 fprintf ( stdout, "\n AUTOMATIC RECOGNITION");
416 fprintf ( stdout, "\n perl_xxx:............. runs xxx onto the input file");
417 fprintf ( stdout, "\n xxxx <file> > outfile..xxx reads any formats, outputs fasta");
418 fprintf ( stdout, "\n amps_aln saga_aln ");
419 fprintf ( stdout, "\n clustal_aln fasta_aln msf_aln ");
420 fprintf ( stdout, "\n dali_aln gotoh_aln pima_aln");
421 fprintf ( stdout, "\n dialign_aln matrix conc_aln");
422 fprintf ( stdout, "\n NON AUTOMATIC RECOGNITION (use the -input file to specify the format");
423 fprintf ( stdout, "\n number_aln newick_tree");
424 fprintf ( stdout, "\n");
425 fprintf ( stdout, "\n*********** INPUT FORMATS: Sequences *****************");
426 fprintf ( stdout, "\n fasta_seq dali_seq pir_seq");
427 fprintf ( stdout, "\n barton_list_tc amps_sd_scores EST_fasta");
428 fprintf ( stdout, "\n gor_seq gor_struc number_fasta[*]");
429 fprintf ( stdout, "\n swissprot tc_lib pdb_struc");
430 fprintf ( stdout, "\n");
431 fprintf ( stdout, "\n*********** INPUT FORMATS: Structures *****************");
432 fprintf ( stdout, "\n rna_number");
433 fprintf ( stdout, "\n alifold");
434 fprintf ( stdout, "\n*********** OUTPUT FORMATS: Alignments ******************");
435 fprintf ( stdout, "\n compressed_aln saga_aln clustal_aln");
436 fprintf ( stdout, "\n phylip_aln msf_aln fasta_aln ");
437 fprintf ( stdout, "\n pir_aln ");
438 fprintf ( stdout, "\n color_html,color_ps......colored using the struc_in file ");
439 fprintf ( stdout, "\n color_protogene..........colors codons");
440 fprintf ( stdout, "\n color_exoset.............mixes conservation (gray) and introns (RGB)");
441 fprintf ( stdout, "\n color_pdf pw_lib_saga_aln tdna_aln");
442 fprintf ( stdout, "\n thread_dna_on_prot_aln");
443 fprintf ( stdout, "\n");
444 fprintf ( stdout, "\n*********** OUTPUT FORMATS: sequence ******************");
445 fprintf ( stdout, "\n fasta_seq fasta_seq1 gotoh_seq");
446 fprintf ( stdout, "\n gor_seq cache_id");
447 fprintf ( stdout, "\n");
448 fprintf ( stdout, "\n*********** OUTPUT FORMATS: weights ******************");
449 fprintf ( stdout, "\n constraints saga_pw_sd_weights nseq\n");
450 fprintf ( stdout, "\n");
451 fprintf ( stdout, "\n*********** OUTPUT Formats: special ****************");
452 fprintf ( stdout, "\n len name statistics<_hnrglNL>");
453 fprintf ( stdout, "\n sim............outputs a similarity matrix based on an id comparison of -in");
454 fprintf ( stdout, "\n sim_sarmat.....in is sar matrix");
455 fprintf ( stdout, "\n sim_idscore....makes dp alignment of the sequences using Blosum62mt");
456 fprintf ( stdout, "\n sim_idscoreDNA.makes dp alignment of the sequences using idmat");
457 fprintf ( stdout, "\n sim............if -in2 is set: in1 vs in2, idscore");
459 fprintf ( stdout, "\n code_name......Outputs a compact list of names for code/decode");
463 fprintf ( stdout, "\n");
466 fprintf ( stdout, "\n");
470 argv=standard_initialisation (argv, &argc);
473 for ( a=1; a< argc; a++)
475 if (a==1 && argv[1][0]!='-')
477 sprintf( in_file, "%s", argv[a]);
479 else if ( strcmp ( argv[a], "-in_f")==0 ||strm(argv[a],"-input") )
481 if ( strcmp ( argv[a], "-in_f")==0) fprintf ( stdout,"\nWARNING: %s deprecated, use -input instead", argv[a]);
483 sprintf ( in_format, "%s", argv[a+1]);
487 else if ( strcmp ( argv[a], "-cache")==0 )
489 sprintf (cache, "%s", argv[a+1]);
495 else if ( strcmp ( argv[a], "-exon_boundaries")==0 )
498 set_string_variable ("exon_boundaries", argv[a+1]);
501 else if ( strcmp ( argv[a], "-overaln_threshold")==0 )
504 set_int_variable ("overaln_threshold", atoi(argv[a+1]));
507 else if ( strcmp ( argv[a], "-overaln_target")==0 )
510 set_int_variable ("overaln_target", atoi(argv[a+1]));
513 else if ( strcmp ( argv[a], "-overaln_P1")==0 )
516 set_int_variable ("overaln_P1", atoi(argv[a+1]));
519 else if ( strcmp ( argv[a], "-overaln_P2")==0 )
522 set_int_variable ("overaln_P2", atoi(argv[a+1]));
525 else if ( strcmp ( argv[a], "-overaln_P3")==0 )
528 set_int_variable ("overaln_P3", atoi(argv[a+1]));
531 else if ( strcmp ( argv[a], "-overaln_P4")==0 )
534 set_int_variable ("overaln_P4", atoi(argv[a+1]));
538 else if ( strcmp ( argv[a], "-in2_f")==0||strm(argv[a],"-input2") )
540 if ( strcmp ( argv[a], "-in_f")==0) fprintf ( stdout,"\nWARNING: %s deprecated, use -input2 instead", argv[a]);
542 sprintf ( in2_format, "%s", argv[a+1]);
545 else if ( strcmp ( argv[a], "-seqnos")==0)
547 sprintf (action_list[n_actions++], "seqnos");
550 else if ( strcmp( argv[a], "-action")==0)
552 while ((a+1)<argc && argv[a+1][0]!='-')
554 sprintf (action_list[n_actions++], "%s", argv[a+1]);
558 else if ( strcmp ( argv[a], "-keep_case")==0)
560 if(!NEXT_ARG_IS_FLAG)RAD->keep_case=1;
561 else RAD->keep_case=(strm3(argv[a], "on","ON","On"))?1:0;
565 else if ( strcmp ( argv[a], "-conv")==0)
567 if ( strncmp ( argv[a+1],"set",3)==0)RAD->symbol_list=make_symbols (argv[++a],&(RAD->n_symbol));
570 RAD->symbol_list=declare_char (STRING, STRING);
571 while(!NEXT_ARG_IS_FLAG)
573 sprintf ( RAD->symbol_list[RAD->n_symbol], "%s", argv[++a]);
578 else if ( strcmp ( argv[a], "-struc_in_f")==0 ||strcmp ( argv[a], "-input3")==0 )
580 sprintf ( struc_in_format, "%s", argv[a+1]);
583 else if ( strcmp ( argv[a], "-out_f")==0 ||strm(argv[a],"-output") )
585 if ( strcmp ( argv[a], "-out_f")==0) fprintf (stdout, "\nWARNING: %s deprecated, use -output instead", argv[a]);
586 sprintf ( out_format, "%s", argv[a+1]);
589 else if ( strm ( argv[a], "-struc_out_f") || strm ( argv[a], "-output_struc") )
591 sprintf ( struc_out_format, "%s", argv[a+1]);
594 else if ( strcmp (argv[a],"-in")==0)
596 sprintf( in_file, "%s", argv[a+1]);
599 else if ( strcmp (argv[a],"-rename")==0)
601 sprintf( rename_file, "%s", argv[a+1]);
604 else if ( strcmp (argv[a],"-code")==0)
607 sprintf( rename_file, "%s", argv[a+1]);
610 else if ( strcmp (argv[a],"-decode")==0)
613 sprintf( rename_file, "%s", argv[a+1]);
616 else if ( strcmp (argv[a],"-in2")==0)
618 sprintf( in2_file, "%s", argv[a+1]);
621 else if ( strcmp (argv[a],"-coor")==0)
623 sprintf( RAD->coor_file, "%s", argv[a+1]);
626 else if (strcmp (argv[a],"-out")==0)
628 sprintf (out_file, "%s", argv[a+1]);
631 else if (strcmp (argv[a],"-out2")==0)
633 sprintf (out2_file, "%s", argv[a+1]);
636 else if ( strcmp (argv[a],"-struc_in")==0 || strcmp (argv[a],"-in3")==0 )
638 sprintf( struc_in_file, "%s", argv[a+1]);
641 else if (strcmp (argv[a],"-struc_out")==0)
643 sprintf (struc_out_file, "%s", argv[a+1]);
646 else if ( strcmp ( argv[a], "-rm_gap")==0)
650 else if ( strcmp ( argv[a], "-print_format")==0)
654 else if ( strcmp ( argv[a], "-no_warning")==0)
656 set_warning_mode (NO);
661 fprintf ( stdout, "\nUNKNOWN OPTION: %s", argv[a]);
662 myexit(EXIT_FAILURE);
665 /****************************************************************/
667 /* Data Preparation */
670 /****************************************************************/
672 prepare_cache (cache);
673 /****************************************************************/
678 /****************************************************************/
681 if ( strm (out_format, "hasch"))
683 fprintf ( stdout, "%d\n", (int)hash_file(in_file));
689 rename_list=read_rename_file ( rename_file,code);
693 if ((D1=read_data_structure (in_format, in_file,RAD))!=NULL)
695 in_format=(in_format && in_format[0])?in_format:identify_seq_format(in_file);
697 if (print_format)fprintf ( stdout, "\nFILE:%s FORMAT:%s\n", in_file, in_format);
699 else if ( in_file[0])
701 fprintf ( stdout, "\nFORMAT of file %s Not Supported[FATAL:%s]\n", in_file, PROGRAM);
702 myexit(EXIT_FAILURE);
705 if ((D2=read_data_structure (in2_format, in2_file,RAD))!=NULL){if (print_format)fprintf ( stderr, "\nFILE:%s FORMAT:%s\n", in2_file, (in2_format&&in2_format[0])?in2_format:identify_seq_format(in2_file));}
707 else if (!D2 && in2_file[0])
709 fprintf ( stderr, "\nFORMAT of file %s Not Supported [FATAL:%s]\n", in2_file, PROGRAM);
710 myexit(EXIT_FAILURE);
716 if ((D_ST=read_data_structure (struc_in_format, struc_in_file,RAD)))
726 entry=vcalloc ( LIST_N_FIELDS, sizeof (int));
728 for (a=0; a<CL->ne; a++)
730 entry=extract_entry (entry, a, CL);
731 if ( D_ST->S)(D_ST->S)->seq[entry[SEQ1]][entry[R1]-1]=entry[WE];
733 thread_seq_struc2aln (D_ST->A, D_ST->S);
735 else if ( name_is_in_list ("cons", ((D_ST)->A)->name, ((D_ST)->A)->nseq, 100));
738 D_ST->A=copy_aln ( D1->A, D_ST->A);
740 thread_seq_struc2aln (D_ST->A, D_ST->S);
743 else if ((strcmp (struc_in_format, "rna_number")==0) && in_file[0])
745 D_ST->RNA_ST=read_rna_struc_number((D1->A),struc_in_file);
747 else if ( struc_in_format[0] && struc_in_file[0])
750 fprintf ( stderr, "\nSTRUC %s UNKNOWN[FATAL]", struc_in_format);
751 myexit(EXIT_FAILURE);
755 D_ST=vcalloc ( 1, sizeof (Sequence_data_struc));
758 action=declare_char(100, 100);
759 for ( a=0; a< n_actions;)
761 if (action_list[a][0]!='+')
763 fprintf ( stderr, "\nWARNING: Action %s Unknown. Actions start with a +", action_list[a]);
764 myexit (EXIT_FAILURE);
769 sprintf ( action[b++], "%s", action_list[a++]+1);
770 while ( a<n_actions && action_list[a][0]!='+')sprintf ( action[b++], "%s", action_list[a++]);
771 modify_data( D1, D2, D_ST, action,b, RAD);
777 if (D1)D1->A= rename_seq_in_aln(D1->A, rename_list);
778 if (D2)D2->A=rename_seq_in_aln (D2->A, rename_list);
779 if (D_ST)D_ST->A=rename_seq_in_aln (D_ST->A,rename_list);
781 if (D1)D1->T =rename_seq_in_tree (D1->T, rename_list);
782 if (D2)D2->T =rename_seq_in_tree (D2->T, rename_list);
783 if (D_ST)D_ST->T=rename_seq_in_tree (D_ST->T,rename_list);
787 if ( !out_format[0] && ! struc_out_format[0])sprintf ( out_format, "%s", (in_format && in_format[0])?in_format:"fasta_aln");
788 main_output ( D1, D2, D_ST, out_format, out_file);
789 main_output ( D1, D2, D_ST, struc_out_format, struc_out_file);
796 /**************************************************************************************************/
797 /***************************** FORMAT GUESSING ******************************************/
798 /**************************************************************************************************/
/*
 * read_data_structure
 * Reads in_file, interpreted as format in_format, into a newly allocated
 * Sequence_data_struc.
 *   in_format  format name used to pick one of the readers below; the code
 *              also overwrites it with identify_seq_format(in_file) (the
 *              condition guarding that call is not visible in this listing).
 *   in_file    file to read; an empty name makes the function return NULL.
 *   RAD        action settings; keep_case and rm_gap are read here.
 * Returns NULL when in_file or the resolved format is empty. The "tree_list"
 * branch returns the result of a recursive call made on a temporary file.
 * Depending on the format, the reader fills D->A, D->S, D->T, D->M, D->W
 * and/or D->CL.
 */
799 Sequence_data_struc *read_data_structure ( char *in_format, char *in_file, Action_data_struc *RAD)
802 Sequence_data_struc *D;
803 char **seq_name=NULL, **sequences=NULL;
807 D=vcalloc ( 1, sizeof (Sequence_data_struc));
/* NOTE(review): D is already allocated when the two early returns below fire - confirm it does not leak */
810 if (!in_file[0])return NULL;
813 in_format=identify_seq_format(in_file);
815 if (!in_format[0])return NULL;
819 D->A=declare_Alignment(NULL);
820 if ( RAD->keep_case)(D->A)->residue_case=KEEP_CASE;
822 D->rm_gap=RAD->rm_gap;
823 sprintf ( D->format, "%s", in_format);
824 sprintf ( D->file, "%s", in_file);
/* Format dispatch: the first matching branch of the chain below reads the file */
829 if ( strm2(in_format,"saga_aln","clustal_aln"))
831 read_aln (in_file, D->A);
836 else if ( strm (in_format, "treefile_list"))
839 D->S=get_tree_file_list(in_file);
840 D->A=seq2aln(D->S, D->A,NO_PAD);
842 else if ( strm (in_format, "file_list") || strm (in_format, "list"))
844 D->S=get_file_list(in_file);
845 D->A=seq2aln(D->S, D->A,KEEP_GAP);
847 else if ( strm (in_format, "fasta_tree"))
850 D->S=get_fasta_tree (in_file, NULL);
851 D->A=seq2aln(D->S, D->A,NO_PAD);
854 else if ( strm (in_format, "tree_list") || strm (in_format, "treelist"))
/* tree_list: lines of in_file are rewritten as ">Tree_n" records in a temporary file, which is then re-read as "fasta_tree" */
862 seq_file=vtmpnam(NULL);
/* NOTE(review): seq_file is opened for writing twice (seq here, fp two lines below) - confirm 'seq' is needed and closed */
863 seq=vfopen (seq_file, "w");
864 line=file2lines (in_file);
865 fp=vfopen (seq_file, "w");
/* The loop starts at index 1 and stops before atoi(line[0]) */
866 for ( n=1; n<atoi(line[0]); n++)
868 fprintf ( fp, ">Tree_%d\n%s\n", n,line[n]);
872 free_char (line, -1);
873 return read_data_structure ( "fasta_tree",seq_file,RAD);
876 else if (strm (in_format, "matrix"))
878 D->M=read_matrice (in_file);
880 else if (strm4 (in_format, "newick_tree", "newick", "nh", "new_hampshire"))
882 D->T=main_read_tree (in_file);
883 D->S=tree2seq(D->T, NULL);
884 D->A=seq2aln (D->S,D->A, 0);
886 else if (strm (in_format, "blast_aln"))
888 if (read_blast_aln (in_file, D->A))
897 else if ( strm( in_format,"number_aln"))
899 read_number_aln (in_file, D->A);
902 else if ( strm( in_format,"stockholm_aln"))
904 read_stockholm_aln (in_file, D->A);
907 else if ( strm( in_format,"gotoh_aln"))
909 read_gotoh_aln (in_file, D->A);
913 else if ( strm ( in_format, "msf_aln"))
915 read_msf_aln (in_file, D->A);
918 else if ( strm ( in_format, "amps_aln"))
920 read_amps_aln (in_file, D->A);
/* The three branches below convert the input to FASTA through a perl script before building the alignment */
923 else if ( strm (in_format, "excel_seq"))
925 D->S=perl_reformat2fasta ("excel2fasta.pl",in_file);
926 (D->S)->contains_gap=0;
927 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
929 else if ( strm (in_format, "pavie_seq"))
931 D->S=perl_reformat2fasta ("pavie2fasta.pl",in_file);
932 (D->S)->contains_gap=0;
933 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
/* "perl_<script>": the text after the "perl_" prefix is passed as the script name */
935 else if ( strncmp (in_format, "perl_",5 )==0)
937 D->S=perl_reformat2fasta (in_format+5,in_file);
938 (D->S)->contains_gap=0;
939 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
941 else if ( strm (in_format, "number_fasta"))
943 D->S=get_fasta_sequence_num (in_file, NULL);
944 (D->S)->contains_gap=0;
945 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
947 else if ( strm (in_format, "raw_fasta"))
949 D->S=get_fasta_sequence_raw (in_file, NULL);
950 (D->S)->contains_gap=0;
951 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
954 else if ( strm2 (in_format, "fasta_aln", "fasta_seq"))
957 D->S=get_fasta_sequence (in_file, NULL);
958 if ( strcmp (in_format, "fasta_aln")==0)(D->S)->contains_gap=0;
959 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
/* NOTE(review): "fasta_tree" is already matched by an earlier branch of this chain, so this branch looks unreachable */
961 else if ( strm (in_format, "fasta_tree"))
964 D->S=get_fasta_tree (in_file, NULL);
965 D->A=seq2aln(D->S, D->A, NO_PAD);
968 else if ( strm (in_format, "pdb") || strm (in_format, "pdb_struc"))
970 D->S=get_pdb_sequence (in_file);
973 add_warning (stderr, "FAILED TO find PDB File %s", in_file);
974 myexit (EXIT_FAILURE);
976 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
/* From here on most branches call seq2aln without storing its result in D->A */
978 else if ( strm2(in_format, "pir_seq", "pir_aln"))
980 D->S=get_pir_sequence ( in_file,NULL );
981 seq2aln(D->S, D->A, RAD->rm_gap);
983 else if ( strm(in_format, "gor_seq") )
985 D->S=get_gor_sequence ( in_file,NULL );
986 seq2aln(D->S, D->A, RAD->rm_gap);
988 else if ( strm2 ( in_format, "dali_aln", "dali_seq"))
990 D->S=get_sequence_dali ( in_file);
991 seq2aln(D->S, D->A, RAD->rm_gap);
/* NOTE(review): the value returned by get_barton_list_tc_seq is not stored anywhere in D */
993 else if ( strm (in_format, "barton_list_tc"))
995 get_barton_list_tc_seq ( in_file);
997 else if ( strm (in_format, "amps_sd_scores"))
999 D->W=get_amps_sd_scores ( in_file);
1002 else if ( strm ( in_format, "pima_aln"))
1004 D->S=get_pima_sequence ( in_file);
1005 seq2aln (D->S, D->A, RAD->rm_gap);
1007 else if ( strm( in_format, "gor_struc"))
1009 D->S=get_struc_gor ( in_file);
1010 seq2aln(D->S, D->A, RAD->rm_gap);
1012 else if ( strm( in_format, "dialign_aln"))
1014 D->S=get_dialign_sequence ( in_file);
1015 seq2aln (D->S, D->A, RAD->rm_gap);
/* Library formats: sequences are read first, then the constraint list is loaded from the same file */
1017 else if ( strm( in_format, "tc_lib") || strm( in_format, "mocca_lib") || strm( in_format, "lib"))
1019 read_seq_in_list (in_file,&nseq,&sequences,&seq_name);
1020 D->S=fill_sequence_struc ( nseq, sequences, seq_name);
1021 D->CL=declare_constraint_list ( D->S,NULL, NULL, 0,NULL, NULL);
1022 D->CL=read_constraint_list_file(D->CL,in_file);
1023 seq2aln (D->S, D->A, RAD->rm_gap);
1024 free_char (sequences,-1);
1025 free_char (seq_name, -1);
1027 else if ( strm( in_format,"swissprot_seq"))
1029 D->S=get_swissprot_sequence ( in_file,NULL);
1030 seq2aln (D->S, D->A, RAD->rm_gap);
1032 else if (strm (in_format, "alifold"))
1034 D->S=read_alifold ( in_file);
1035 seq2aln (D->S, D->A,0);
/* Record the source file name on every entry of the alignment and of the sequence set */
1044 for ( a=0; a<(D->A)->nseq; a++)sprintf ( (D->A)->file[a], "%s", in_file);
/* NOTE(review): this loop is bounded by (D->A)->nseq but writes into (D->S)->file - confirm both counts agree */
1048 for ( a=0; a<(D->A)->nseq; a++)sprintf ( (D->S)->file[a], "%s", in_file);
1053 Sequence *read_sequences (char *name)
1055 return main_read_seq (name);
1057 Alignment * alifold2aln (char *file)
1060 S=read_alifold(file);
1061 sprintf ( S->seq[0],"%s", S->seq[1]);
1062 return seq2aln (S, NULL, 0);
/*
 * read_alifold
 * Reads an alifold output file into a two-entry Sequence: entry 0 is named
 * "cons" and entry 1 is named "#=GC SS_cons".
 * The statements that copy the file lines into S->seq[0] and S->seq[1], and
 * the return statement, are not visible in this listing.
 */
1064 Sequence * read_alifold (char *file)
/* Sequence buffers are sized from the number of characters in the file */
1069 S=declare_sequence (1,count_n_char_in_file (file),2);
1070 list=file2lines (file);
/* Note: the C literal "\0" is an empty string, so the first two substitutions on each entry replace newlines and blanks with nothing */
1074 substitute (S->seq[0], "\n", "\0");
1075 substitute (S->seq[0], " ", "\0");
1076 substitute (S->seq[0], "_", STOCKHOLM_STRING);
1077 l=strlen (S->seq[0]);
1078 substitute (S->seq[1], "\n", "\0");
1079 substitute (S->seq[1], " ", "\0");
1080 substitute (S->seq[1], ".", STOCKHOLM_STRING);
/* NOTE(review): the format strings below contain no conversion, so the extra 'file' argument is ignored */
1082 sprintf (S->name[0], "cons", file);
1083 sprintf (S->name[1], "#=GC SS_cons", file);
/*
 * main_read_seq
 * Reads the sequences contained in file `name`. The format is obtained from
 * identify_seq_format and selects the reader:
 *   "fasta_seq", "pir_seq", "swissprot_seq"  dedicated sequence readers
 *   any format containing "aln"              main_read_aln
 *   any format containing "tc_lib"           read_seq_in_list + fill_sequence_struc
 * cw_read_sequences (the ClustalW routine) is also called; the condition
 * leading to it is not visible in this listing - presumably the fallback.
 * The file name is then stored on every sequence and the set goes through
 * clean_sequence. The return statement is not visible here.
 */
1092 Sequence * main_read_seq ( char *name)
1100 format=identify_seq_format (name);
/* Optional trace, enabled through the DEBUG_REFORMAT debug variable */
1103 if ( getenv4debug ("DEBUG_REFORMAT"))fprintf ( stderr, "\n\nFormat %s\n", format);
1106 if (format &&strm(format, "fasta_seq"))
1108 S= get_fasta_sequence ( name, NULL);
1110 else if (format &&strm(format, "pir_seq")) S= get_pir_sequence ( name, NULL);
1111 else if (format &&strm(format,"swissprot_seq"))S= get_swissprot_sequence (name, NULL);
/* Alignment formats are matched by substring; how A is turned into S is not visible here */
1112 else if (format && strstr (format, "aln"))
1114 A=main_read_aln ( name, NULL);
1119 else if ( format && strstr (format, "tc_lib"))
1122 char **sequences=NULL, **seq_name=NULL;
1124 read_seq_in_list (name,&nseq,&sequences,&seq_name);
1125 S=fill_sequence_struc ( nseq, sequences, seq_name);
1126 for ( b=0; b< S->nseq; b++)sprintf ( S->file[b], "%s",name);
1127 free_char (seq_name, -1);free_char (sequences, -1);
1131 /*Use The ClustalW routine*/
1132 S=cw_read_sequences (name);
/* Remember which file each sequence came from */
1135 for ( a=0; a<S->nseq; a++)sprintf ( S->file[a], "%s", name);
1138 S=clean_sequence ( S);
/*
 * main_read_aln
 * Reads the alignment stored in file `name` into A (declared here when A is NULL).
 *   name  file name; when the file does not exist but name+1 does, a leading
 *         'A' or 'S' type letter is skipped.
 *   A     alignment to fill, or NULL.
 * Returns NULL when name is NULL, when neither name nor name+1 exists, or when
 * a "fasta_seq"/"pir_seq" file holds sequences of unequal length. The final
 * return statement is not visible in this listing.
 * After reading, duplicated names are made unique, A->S is set (IN_SEQ when
 * available, otherwise derived from A), ungapped, and the alignment is fixed
 * against it; every A->file[] entry is set to `name`.
 */
1142 Alignment * main_read_aln ( char *name, Alignment *A)
1146 static char *format;
1151 if ( !name)return NULL;
1152 else if (!check_file_exists(name))
1154 if ( !check_file_exists (name+1))return NULL;
1155 else if ( name[0]=='A') name++;
1156 else if ( name[0]=='S') name++;/*Line Added for the -convert flag of T-Coffee*/
1160 if (!A)A=declare_aln(NULL);
1161 format=identify_seq_format (name);
/* NOTE(review): the 'format &&' test only protects the first strm call of this condition */
1165 if ((format && strm(format, "saga_aln" )) ||strm(format, "clustal_aln")||strm(format, "t_coffee_aln" ) )
1168 read_aln ( name, A);
1171 else if (format && strm (format, "conc_aln"))A=input_conc_aln (name,NULL);
1172 else if (format &&strm(format, "msf_aln" ))read_msf_aln ( name, A);
1173 else if (format &&strm(format, "blast_aln"))read_blast_aln (name, A);
1174 else if (format &&(strm(format, "fasta_aln")))
1178 S=get_fasta_sequence ( name, NULL);
1183 else if (format &&strm(format, "pir_aln"))
1185 S=get_pir_sequence ( name, NULL);
/* Sequence files are accepted as alignments only when all sequences have the same length */
1189 else if (format && strm(format, "fasta_seq") && A)
1191 S=get_fasta_sequence ( name, NULL);
1193 for ( a=1; a<S->nseq; a++)if ( strlen (S->seq[a-1])!=strlen (S->seq[a])){free_sequence (S, S->nseq); free_aln (A); return NULL;}
1198 else if (format && strm(format, "pir_seq") && A)
1200 S=get_pir_sequence ( name, NULL);
1202 for ( a=1; a<S->nseq; a++)if ( strlen (S->seq[a-1])!=strlen (S->seq[a])){free_sequence (S, S->nseq); free_aln (A); return NULL;}
/* Duplicated sequence names trigger a warning and are made unique */
1213 if ( check_list_for_dup( A->name, A->nseq))
1215 fprintf ( stderr, "\nWARNING (main_read_aln): %s is duplicated in File %s ", check_list_for_dup( A->name, A->nseq), A->file[0]);
1216 A=aln2unique_name_aln(A);
1219 if (IN_SEQ)A->S=IN_SEQ;
1220 else if (!A->S){A->S=aln2seq(A);}
1222 A->S=ungap_seq(A->S);
1223 A=fix_aln_seq(A, A->S);
1225 for ( a=0; a< A->nseq; a++) sprintf ( A->file[a], "%s", name);
/* Identifies the format of an alignment file.
   Alias of identify_seq_format, which recognises both sequence and alignment formats. */
char * identify_aln_format ( char *file)
{
  char *detected;

  detected=identify_seq_format (file);
  return detected;
}
/*
 * identify_seq_format
 * Writes into `format` the name of the first file format whose detector
 * accepts `file`. Detectors are tried in the fixed order listed below, so an
 * earlier match hides later ones.
 * A missing file is fatal: an error is printed and myexit is called.
 * When no detector matches, add_warning is called. The declaration of
 * `format` and the return statement are not visible in this listing -
 * presumably `format` is returned.
 */
1237 char * identify_seq_format ( char *file)
1240 /*This function identifies known sequence and alignment formats*/
/* format is a 100-character buffer, allocated when NULL and otherwise reset to the empty string */
1242 if ( format==NULL)format=vcalloc ( 100, sizeof (char));
1243 else format[0]='\0';
1247 if ( !check_file_exists(file))
1249 fprintf (stderr, "ERROR: %s Does Not Exist [FATAL:%s]\n",file, PROGRAM);
1250 myexit (EXIT_FAILURE);
1252 else if ( is_stockholm_aln (file))sprintf (format, "stockholm_aln");
1253 else if ( is_blast_file (file))sprintf ( format, "blast_aln");
1254 else if ( is_pdb_file(file))sprintf ( format, "pdb_struc");
1255 else if ( format_is_msf (file))sprintf ( format, "msf_aln");
1256 else if ( format_is_fasta_seq(file))sprintf ( format, "fasta_seq");
1257 else if ( format_is_fasta_aln(file))sprintf ( format, "fasta_aln");
1258 else if ( format_is_pir_aln (file))sprintf ( format, "pir_aln");
1259 else if ( format_is_pir_seq (file))sprintf ( format, "pir_seq");
1260 else if ( format_is_oligo (file))sprintf ( format, "oligo_aln");
1261 else if ( format_is_swissprot (file))sprintf ( format, "swissprot_seq");
/* Files accepted by format_is_saga are reported as "clustal_aln" */
1262 else if ( format_is_saga (file))sprintf ( format, "clustal_aln");
1263 else if ( format_is_conc_aln (file))sprintf ( format, "conc_aln");
1264 else if ( is_lib (file))sprintf ( format, "tc_lib");
1265 else if ( is_lib_02 (file))sprintf ( format, "tc_lib_02");
1266 else if ( is_newick(file))sprintf ( format, "newick_tree");
1270 add_warning ( stderr, "\nThe Format of File: %s was not recognized [SERIOUS:%s]",file, PROGRAM);
/*
 * identify_list_format
 * For each of the n entries of list: copies the entry into name, asks
 * identify_format for its type letter and, unless the answer is '?',
 * rewrites list[a] in place as "<type letter><name>". Otherwise an error
 * is printed.
 * How `string` is set up before the identify_format call, what follows the
 * error message, and the return statement are not visible in this listing.
 */
1274 char **identify_list_format ( char **list, int n)
1283 declare_name (name);
1284 for ( a=0; a< n; a++)
1287 sprintf (name, "%s", list[a]);
1289 if ((mode=identify_format ( &string))!='?')
/* identify_format receives &string and may move the pointer, so the name is re-read from it */
1291 sprintf ( name, "%s", string);
1292 sprintf ( list[a], "%c%s", mode,name);
1296 fprintf ( stderr, "\nERROR: %s not recognised [FATAL:%s]", name, PROGRAM);
/*
 * name2type_name
 * Builds a copy of `name` prefixed with its type letter, as returned by
 * identify_format. The copy is allocated with room for one extra character.
 * When name already starts with one of "ALSMXPRW" and no file of that exact
 * name exists, the name is copied as it is.
 * The else branch structure and the return statement are not visible in this
 * listing - presumably new_name is returned.
 */
1305 char * name2type_name ( char *name)
1307 /*turns <file> into <Sfile>, <Afile>...*/
/* strlen+2: one byte for the type letter, one for the terminator */
1311 new_name=vcalloc ( strlen (name)+2, sizeof (char));
1312 sprintf ( new_name, "%s", name);
1313 if (is_in_set (name[0], "ALSMXPRW") && !check_file_exists(name))
1315 sprintf ( new_name, "%s", name);
/* NOTE(review): identify_format takes &new_name and can advance the pointer it is given - confirm new_name still points to the start of the allocation afterwards */
1319 mode=identify_format (&new_name);
1320 sprintf ( new_name, "%c%s", mode,name);
/*
 * identify_format
 * Returns a one-letter type code for the name pointed to by fname.
 * First the name is checked for an explicit leading type letter: one of
 * "ALMSPR" followed by an existing file, 'X' followed by a matrix, or 'M'
 * followed by a method. A 'W' prefix on a non-existing file is skipped by
 * advancing fname[0].
 * Otherwise the content is probed in this order: library 'L', PDB file 'P',
 * sequences 'S', alignment 'A', matrix 'X', method 'M'.
 * The initialisation of `mode`, the bodies of the prefix branches and the
 * return statement are not visible in this listing.
 */
1325 char identify_format (char **fname)
1330 if ((is_in_set (mode, "ALMSPR") && check_file_exists(fname[0]+1)) ||(mode=='X' && is_matrix ( fname[0]+1)) ||(mode=='M' && is_method(fname[0]+1)) )
1335 else if (mode=='W' && !check_file_exists(fname[0])){fname[0]++;}
1339 /*WARNING: Order matters => internal methods can be confused with files, must be checked last*/
1340 if (is_lib(fname[0]))mode='L';
1341 else if (is_pdb_file(fname[0]))mode='P';
1342 else if (is_seq(fname[0]))mode='S';
1343 else if (is_aln(fname[0]))mode='A';
1344 else if (is_matrix(fname[0]))mode='X';
1345 else if (is_method(fname[0]))mode='M';
/*
 * is_pdb_name
 * Tells whether `name` is a PDB identifier by running the external command
 * "extract_from_pdb -is_pdb_name" and reading an integer from its output.
 * The result is normalised to 1 or 0.
 * Answers are remembered in the static arrays buf_names/buf_result (1000
 * entries, names of up to 100 characters) and returned directly on a repeat.
 * The condition guarding the allocation of those arrays and the return
 * statement are not visible in this listing.
 */
1353 int is_pdb_name ( char *name)
1358 static char **buf_names;
1359 static int *buf_result;
1367 buf_names=declare_char (1000, 100);
1368 buf_result=vcalloc (1000, sizeof (int));
/* Already answered: reuse the stored result */
1370 if ( (result=name_is_in_list ( name, buf_names,nbuf,100))!=-1)return buf_result[result];
1374 result_file=vtmpnam (NULL);
/* NOTE(review): name is inserted in a shell command between single quotes - confirm callers never pass a name containing a quote */
1376 sprintf ( command, "extract_from_pdb -is_pdb_name \'%s\' > %s", name, result_file);
1377 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:is_pdb_name] %s\n", command);
1378 my_system ( command);
1380 fp=vfopen ( result_file, "r");
/* NOTE(review): the fscanf result is not checked; result keeps its earlier value if nothing is read */
1381 fscanf ( fp, "%d", &result);
1383 vremove ( result_file);
/* NOTE(review): no bound check on nbuf is visible before storing entry number nbuf */
1385 sprintf ( buf_names[nbuf], "%s", name);
1386 result=buf_result[nbuf++]=(result==1)?1:0;
/*
 * get_pdb_id
 * Returns a newly allocated string holding the identifier that the external
 * command "extract_from_pdb -get_pdb_id" prints for the given PDB file.
 * The copy of the file found under the cache directory is preferred when it
 * exists. The return statement is not visible in this listing - presumably
 * id is returned.
 */
1392 char* get_pdb_id ( char *file)
1394 /*receives the name of a pdb file*/
1395 /*reads the structure id in the header*/
1396 /*returns the pdb_id*/
1398 char command[10000];
1406 tmp_name=vtmpnam(NULL);
/* Prefer the cached copy of the file when there is one */
1408 sprintf ( cached, "%s/%s", get_cache_dir(),file);
1409 if ( check_file_exists(cached))sprintf ( fname, "%s", cached);
1410 else sprintf ( fname, "%s", file);
1412 sprintf ( command, "extract_from_pdb -get_pdb_id %s > %s",fname, tmp_name);
1414 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_id] %s\n", command);
1415 my_system ( command);
1418 fp=vfopen (tmp_name, "r");
/* NOTE(review): %s has no width limit and the fscanf result is not checked - confirm buf is large enough */
1419 fscanf ( fp, "\n%s\n", buf);
1422 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_id]DONE\n");
/* Hand back an exactly sized copy of the identifier */
1424 id=vcalloc ( strlen (buf)+1, sizeof (char));
1425 sprintf ( id, "%s", buf);
/*
 * get_pdb_struc
 * Resolves in_name to a PDB structure file and optionally extracts the
 * segment [start, end] from it.
 *   in_name     structure name, tried as given and, when it starts with 'P',
 *               also without that first letter.
 *   start, end  segment coordinates; when both are 0 the file name returned
 *               by is_pdb_struc is returned unchanged.
 * For a segment, "extract_from_pdb -chain FIRST -coor" writes a file named
 * "<name>_<start>_<end>.pdb" under the cache directory, and that name is
 * returned if it is recognised as a PDB file.
 * Failures print an error and call crash.
 */
1433 char* get_pdb_struc(char *in_name, int start, int end)
1436 char command[LONG_STRING];
1442 name=vcalloc ( STRING, sizeof (char));
1443 sprintf ( name, "%s", in_name);
/* NOTE(review): when the first lookup fails and name does not start with 'P', this condition is false and name1 stays NULL for the branches below - confirm this is intended */
1445 if ( (name1=is_pdb_struc(name))==NULL && (name[0]=='P' && ((name1=is_pdb_struc (name+1))==NULL)))
1447 fprintf ( stderr, "\nERROR Could not download structure %s [FATAL:%s]\n", name, PROGRAM);crash("");
1449 else if ( (start==0) && (end==0))return name1;
1452 declare_name(name2);
1453 sprintf ( name2, "%s_%d_%d.pdb", name, start, end);
/* The value returned by check_file_exists(name1) is used as the input file name */
1454 sprintf ( command, "extract_from_pdb -infile \'%s\' -chain FIRST -coor %d %d > %s%s",check_file_exists(name1),start, end, get_cache_dir(),name2);
1455 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_struc] %s\n", command);
1456 my_system (command);
1458 if ( is_pdb_file(name2))return name2;
1461 fprintf ( stderr, "\nERROR Could not extract segment [%d %d] from structure %s [FATAL:%s]\n",start, end, name, PROGRAM);crash("");
1463 exit (EXIT_FAILURE);
1469 char* seq_is_pdb_struc ( Sequence *S, int i)
1472 if (!S){return NULL;}
1473 else if ( !S->T[i]){return NULL;}
1474 else if ( !((S->T[i])->P)){return NULL;}
1475 else return ((S->T[i])->P)->template_file;
/*
 * is_pdb_struc
 * Resolves `name` to the name of a file containing PDB data, or NULL.
 * Tried in order: `name` itself, "<name>.pdb", and, when is_pdb_name accepts
 * the name, a download through "extract_from_pdb -netfile" that is copied to
 * "<cache dir><name>.pdb".
 * Every answer, including NULL, is stored in the static arrays
 * buf_names/buf_result and returned directly when the same name is asked
 * again. The condition guarding the allocation of the static buffers, the
 * nbuf update and the return statement are not visible in this listing.
 */
1477 char* is_pdb_struc ( char *name)
1480 checks if this is the name of a local file that contains PDB data
1481 checks if this is the name of a file from a local db
1482 put the file in the cache
1483 checks if this is a file from a remote db (extract_from_pdb
1484 return NULL if everything fails
1487 static char *file_name1;
1488 static char *file_name2;
1489 static char **buf_names;
1490 static char **buf_result;
1498 if ( !name || name[0]=='\0')return NULL;
1505 buf_names=vcalloc ( 1000, sizeof (char*));
1506 buf_result=vcalloc ( 1000, sizeof (char*));
1507 file_name1=vcalloc ( 1000, sizeof (char));
1508 file_name2=vcalloc ( 1000, sizeof (char));
/* Already answered: reuse the stored result */
1510 if ( (s=name_is_in_list ( name, buf_names,nbuf,-1))!=-1)return buf_result[s];
1514 sprintf ( file_name1, "%s", name);
1515 sprintf ( file_name2, "%s.pdb", name);
1518 if (is_pdb_file(file_name1)){r=file_name1;}
1519 else if (is_pdb_file(file_name2)){r=file_name2;}
1520 else if (is_pdb_name (name))
1523 tmpname=vtmpnam (NULL);
1526 sprintf ( file_name2, "%s.pdb", name);
1527 /*sprintf ( command, "extract_from_pdb -netfile \'%s\' > %s%s 2>/dev/null",name, get_cache_dir(), file_name2);*/
/* Download into a temporary file first, then copy it into the cache directory */
1528 sprintf ( command, "extract_from_pdb -netfile \'%s\' > %s 2>/dev/null",name,tmpname);
1529 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:is_pdb_struc] %s\n", command);
1530 my_system (command);
1532 sprintf ( command, "cp %s %s%s", tmpname, get_cache_dir(), file_name2);
1533 my_system (command);
1535 if ( is_pdb_file(file_name2))r=file_name2;
/* Store private copies of the name and of the result (r points into the static file_name buffers) */
1541 buf_names[nbuf]=vcalloc ( strlen (name)+1, sizeof (char));
1542 sprintf ( buf_names[nbuf], "%s", name);
1545 buf_result[nbuf]=vcalloc ( strlen (r)+1, sizeof (char));
1546 sprintf (buf_result[nbuf], "%s", r);
1548 else buf_result[nbuf]=NULL;
/*
 * fix_pdb_file
 * Returns the name of a file that is_pdb_file accepts for the input `in`.
 *   - NULL or non-existing input: a newly allocated empty string is returned.
 *   - input already recognised as a PDB file: `in` itself is returned.
 *   - otherwise the file is passed through "extract_from_pdb" into another
 *     file, whose name (tmp2) is returned when the result is a PDB file.
 * How tmp is obtained and what is returned when the conversion fails are not
 * visible in this listing.
 */
1554 char *fix_pdb_file ( char *in)
/* NOTE(review): empty is allocated on every call, even when it is not returned */
1558 empty=vcalloc(1, sizeof(char));
1560 if ( !in || !check_file_exists (in))return empty;
1561 else if ( is_pdb_file(in))return in;
1564 char command[10000];
/* tmp2 is a private copy of the name held in tmp */
1568 tmp2=vcalloc (strlen (tmp)+1, sizeof (char));
1569 sprintf (tmp2, "%s", tmp);
1570 sprintf ( command, "extract_from_pdb %s > %s", check_file_exists(in), tmp2);
1571 my_system (command);
1572 if ( is_pdb_file (tmp))return tmp2;
/*
 * is_sap_file
 * Tests whether `name` looks like a SAP output file. Returns 0 when the file
 * does not exist. Otherwise the token "Percent" is searched twice: first
 * from the start of the file, then again from the stream returned by the
 * first search.
 * The statements run after each search, including the return values, are not
 * visible in this listing.
 */
1578 int is_sap_file ( char *name)
1582 if (!check_file_exists(name))return 0;
1584 if ((fp=find_token_in_file (name, NULL, "Percent"))!=NULL)
1586 if ((fp=find_token_in_file (name,fp, "Percent"))!=NULL)
/*
 * is_blast_file
 * Tests whether `name` is a BLAST result file. Returns 0 when the file does
 * not exist. Two markers are looked for: the
 * "<SequenceSimilaritySearchResult>" tag and the token "Lambda".
 * The statements run when a marker is found, including the return values,
 * are not visible in this listing.
 */
1603 int is_blast_file ( char *name)
1605 if ( !check_file_exists(name) ) return 0;
1606 else if (token_is_in_file (name, "<SequenceSimilaritySearchResult>"))
1612 if (token_is_in_file (name, "Lambda"))
/* Returns 1 when file `name` contains the "SIMPLE_PDB_FORMAT" marker, 0 otherwise.
   The stream returned by the search is closed before returning. */
int is_simple_pdb_file ( char *name)
{
  FILE *hit;

  hit=find_token_in_file (name, NULL, "SIMPLE_PDB_FORMAT");
  if (hit==NULL)
    {
      return 0;
    }
  vfclose (hit);
  return 1;
}
/*
 * is_pdb_file
 * Tests whether `name` is a PDB file. Returns 0 for a NULL name or a file
 * that does not exist. Three record markers are searched at the start of a
 * line: "HEADER", "SEQRES" and "ATOM". 1 is returned when ispdb is at
 * least 2.
 * The statements run when a marker is found are not visible in this listing -
 * presumably each hit closes the stream and increments ispdb.
 */
1631 int is_pdb_file ( char *name)
1636 if ( name==NULL) return 0;
1637 if (!check_file_exists (name))return 0;
1641 if ((fp=find_token_in_file (name, NULL, "\nHEADER"))!=NULL)
1645 if ((fp=find_token_in_file (name, NULL, "\nSEQRES"))!=NULL)
1651 if ((fp=find_token_in_file (name, NULL, "\nATOM"))!=NULL)
1664 if ( ispdb>=2)return 1;
/*
 * is_seq
 * Returns 1 when file `name` exists and the format name reported by
 * identify_seq_format contains "seq"; returns 0 when the file is missing or
 * the format is NULL or empty.
 * The string returned by identify_seq_format is released with vfree on the
 * visible paths. The final branch is not visible in this listing.
 */
1667 int is_seq ( char *name)
1671 if ( !check_file_exists(name))return 0;
1673 format= identify_seq_format(name);
1674 if(!format || format[0]=='\0'){vfree (format);return 0;}
1675 else if (strstr(format, "seq")){vfree (format);return 1;}
/*
 * is_aln
 * Returns 1 when file `name` exists and the format name reported by
 * identify_seq_format contains "aln"; returns 0 when the file is missing or
 * the format is NULL or empty.
 * The string returned by identify_seq_format is released with vfree on the
 * visible paths. The final branch is not visible in this listing.
 */
1678 int is_aln ( char *name)
1681 if ( !check_file_exists (name))return 0;
1683 format= identify_seq_format(name);
1684 if ( !format || format[0]=='\0'){vfree (format);return 0;}
1685 else if (strstr(format, "aln")){vfree (format); return 1;}
/* Returns 1 when read_matrice can load `name` as a matrix, 0 otherwise.
   The matrix is only used as a probe and is released immediately. */
int is_matrix (char *name)
{
  int **probe;

  probe=read_matrice (name);
  if (probe==NULL)
    {
      return 0;
    }
  free_int (probe, -1);
  return 1;
}
/*
 * is_newick
 * Tests whether file `name` looks like a Newick tree: the very first
 * character must be '(' (otherwise 0 is returned), and 1 is returned as soon
 * as a ';' is met while reading the rest of the file.
 * The declarations and what happens when the end of file is reached without
 * a ';' are not visible in this listing.
 */
1696 int is_newick (char *name)
1702 fp=vfopen (name, "r");
1703 if ( (c=fgetc(fp))!='('){vfclose (fp); return 0;}
1706 while ( (c=fgetc(fp))!=EOF)
1708 if ( c==';'){vfclose (fp); return 1;}
/* Returns 1 when file `name` contains the "CLUSTALW_MATRIX" marker, 0 otherwise.
   The stream returned by the search is closed before returning. */
int is_clustalw_matrix ( char *name)
{
  FILE *hit;

  hit=find_token_in_file (name, NULL, "CLUSTALW_MATRIX");
  if (hit==NULL)
    {
      return 0;
    }
  vfclose(hit);
  return 1;
}
/* Returns 1 when file `name` contains the "PAVIE_MATRIX" marker, 0 otherwise.
   The stream returned by the search is closed before returning. */
int is_pavie_matrix ( char *name)
{
  FILE *hit;

  hit=find_token_in_file (name, NULL, "PAVIE_MATRIX");
  if (hit==NULL)
    {
      return 0;
    }
  vfclose(hit);
  return 1;
}
/* Returns 1 when file `name` contains the "TC_DISTANCE_MATRIX_FORMAT_01" marker, 0 otherwise.
   The stream returned by the search is closed before returning. */
int is_distance_matrix_file (char *name)
{
  FILE *hit;

  hit=find_token_in_file (name, NULL, "TC_DISTANCE_MATRIX_FORMAT_01");
  if (hit==NULL)
    {
      return 0;
    }
  vfclose(hit);
  return 1;
}
/* Returns 1 when file `name` contains the "TC_SIMILARITY_MATRIX_FORMAT_01" marker, 0 otherwise.
   The stream returned by the search is closed before returning. */
int is_similarity_matrix_file (char *name)
{
  FILE *hit;

  hit=find_token_in_file (name, NULL, "TC_SIMILARITY_MATRIX_FORMAT_01");
  if (hit==NULL)
    {
      return 0;
    }
  vfclose(hit);
  return 1;
}
/* Returns 1 when file `name` contains the "BLAST_MATRIX" marker, 0 otherwise.
   The stream returned by the search is closed before returning. */
int is_blast_matrix ( char *name)
{
  FILE *hit;

  hit=find_token_in_file (name, NULL, "BLAST_MATRIX");
  if (hit==NULL)
    {
      return 0;
    }
  vfclose(hit);
  return 1;
}
/* Reports whether file `name` contains the "SINGLE_SEQ_WEIGHT_FORMAT_01" marker;
   the value of token_is_in_file is returned as is. */
int is_single_seq_weight_file ( char *name)
{
  int found;

  found=token_is_in_file ( name, "SINGLE_SEQ_WEIGHT_FORMAT_01");
  return found;
}
/*
 * is_stockholm_aln
 * Tests whether `file` is a Stockholm alignment by looking for the token
 * "STOCKHOLM" with find_token_in_file_nlines, called with a line count of 2.
 * The statements run on a hit and the return values are not visible in this
 * listing.
 */
1761 int is_stockholm_aln (char *file)
1765 if ((fp=find_token_in_file_nlines (file, NULL, "STOCKHOLM",2)))
/* Reports whether file `name` is a library file; alias of is_lib_01. */
int is_lib ( char *name)
{
  int found;

  found=is_lib_01(name);
  return found;
}
/* Reports whether file `name` contains the "TC_LIB_FORMAT_02" marker;
   the value of token_is_in_file is returned as is. */
int is_lib_02 ( char *name)
{
  int found;

  found=token_is_in_file ( name, "TC_LIB_FORMAT_02");
  return found;
}
/* Returns 1 when file `name` contains one of the format-01 library markers,
   0 otherwise. Markers are tested in the order listed and the search stops
   at the first one found. */
int is_lib_01 (char *name)
{
  static char *lib_markers[]={"TC_LIB_FORMAT_01", "T-COFFEE_LIB_FORMAT_01", "SEQ_1_TO_N"};
  int n_markers=(int)(sizeof (lib_markers)/sizeof (lib_markers[0]));
  int m;

  for (m=0; m<n_markers; m++)
    {
      if ( token_is_in_file ( name, lib_markers[m]))return 1;
    }
  return 0;
}
/* Returns 1 when file `name` exists and contains the "TC_LIB_LIST_FORMAT_01"
   marker, 0 otherwise. The marker is only searched when the file exists. */
int is_lib_list ( char *name)
{
  if ( check_file_exists (name) && token_is_in_file ( name, "TC_LIB_LIST_FORMAT_01"))
    {
      return 1;
    }
  return 0;
}
/*
 * is_method
 * Tests whether `file` designates an alignment method. The name is copied
 * into new_file; 1 is returned when that file contains the
 * "TC_METHOD_FORMAT_01" marker. Otherwise is_in_pre_set_method_list is
 * consulted.
 * The body of that last branch, the context of the vremove call and the
 * final return value are not visible in this listing.
 */
1800 int is_method ( char *file)
1805 sprintf ( new_file, "%s", file);
1806 if ( (token_is_in_file(new_file, "TC_METHOD_FORMAT_01"))){return 1;}
1807 if ( is_in_pre_set_method_list(new_file))
/* NOTE(review): new_file holds a copy of the caller's file name at this point unless an unseen line changes it - confirm what is being removed */
1810 vremove ( new_file);
1820 /*******************************************************************************************/
1823 /* SEQUENCE FORMAT IDENTIFIERS */
1825 /***************************************************************************************** */
/*
 * type_is_exon_boundaries
 * Returns 1 when a character seq[a][b] is one of the exon boundary symbols
 * 'b', 'o', 'j' or their upper-case forms.
 * The loops producing a and b and the final return value are not visible in
 * this listing - presumably all n strings of seq are scanned and 0 is
 * returned when no symbol is found.
 */
1826 int type_is_exon_boundaries(char **seq, int n)
/* NOTE(review): strchr also matches the terminating '\0' of its first argument - confirm the unseen loop never passes it */
1833 if ( strchr ("bojBOJ", seq[a][b]))return 1;
/*
 * format_is_oligo
 * Tests whether `file` is in the oligo format: the file is opened for
 * reading and r is set to 1 when buf equals "ALPHABET".
 * How buf is read and the return statement are not visible in this listing.
 */
1838 int format_is_oligo(char *file)
1844 fp=vfopen ( file, "r");
1849 if ( strm (buf, "ALPHABET"))r=1;
/*
 * format_is_msf
 * Tests whether `file` is an MSF alignment. Returns 1 when the token "MSF:"
 * is found by find_token_in_file_nlines (called with a line count of 30), or
 * when the first whitespace-delimited word of the file equals "MSF:".
 * The statements between the two tests and the final return value are not
 * visible in this listing.
 */
1855 int format_is_msf ( char *file)
1862 if ( (fp=find_token_in_file_nlines (file,NULL,"MSF:", 30))!=NULL){vfclose (fp);return 1;}
1868 fp=vfopen ( file, "r");
/* NOTE(review): %s has no width limit - confirm buf can hold the longest first word */
1869 fscanf (fp , "%s", buf);
1872 if ( strm (buf, "MSF:"))return 1;
/* Returns 1 when `file` is a FASTA file that format_is_fasta_seq does not
   classify as plain sequences, 0 otherwise. format_is_fasta_seq is only
   called for files accepted by format_is_fasta. */
int format_is_fasta_aln ( char *file)
{
  if ( !format_is_fasta(file))
    {
      return 0;
    }
  return (format_is_fasta_seq(file))?0:1;
}
/*
 * format_is_fasta_seq
 * Tests whether a FASTA file holds unaligned sequences. For a file accepted
 * by format_is_fasta, the sequences are read and 1 is returned when the
 * first sequence is missing or when the sequences do not all have the same
 * length. Otherwise each sequence length is measured twice (l1 and l2).
 * The statement run between the two measurements, the comparison of l1 with
 * l2 and the remaining return values are not visible in this listing.
 */
1885 int format_is_fasta_seq ( char *file)
1890 if ( format_is_fasta (file))
1892 S=get_fasta_sequence (file, NULL);
1894 else if ( !S->seq[0]){free_sequence (S, S->nseq); return 1;}
/* Unequal lengths cannot form an alignment, so the file is classified as sequences */
1895 l=strlen ( S->seq[0]);
1896 for ( a=0; a< S->nseq; a++)if(strlen(S->seq[a])!=l){free_sequence (S, S->nseq);return 1;}
1897 for ( a=0; a< S->nseq; a++)
1899 l1=strlen ( S->seq[a]);
1901 l2=strlen ( S->seq[a]);
1904 free_sequence (S, S->nseq);
1908 free_sequence (S, S->nseq);
1917 int format_is_fasta ( char *file)
1921 if ( !check_file_exists(file))return 0;
1923 if ( get_first_non_white_char (file)!='>')return 0;
1924 if ( !(S=get_fasta_sequence (file, NULL)))return 0;
1925 free_sequence (S, -1);
1926 if ( format_is_pir(file)) return 0;
/* Returns 1 when `file` is a PIR file that format_is_pir_seq does not
   classify as plain sequences, 0 otherwise. format_is_pir_seq is only called
   for files accepted by format_is_pir. */
int format_is_pir_aln ( char *file)
{
  if ( !format_is_pir(file))
    {
      return 0;
    }
  return (format_is_pir_seq(file))?0:1;
}
/*
 * format_is_pir_seq
 * Tests whether a PIR file holds unaligned sequences. For a file accepted by
 * format_is_pir, the sequences are read and each sequence length is measured
 * twice (l1 and l2) before the set is released.
 * The statement run between the two measurements, the comparison of l1 with
 * l2 and the return values are not visible in this listing.
 */
1937 int format_is_pir_seq ( char *file)
1943 if ( format_is_pir (file))
1945 S=get_pir_sequence (file, NULL);
1946 for ( a=0; a< S->nseq; a++)
1948 l1=strlen ( S->seq[a]);
1950 l2=strlen ( S->seq[a]);
1953 free_sequence (S, S->nseq);
/*
 * format_is_pir
 * Tests whether `file` is in PIR format. The file is read with the FASTA
 * reader; 1 is returned when both pir_name and star_end are still set after
 * the loop over the sequences. pir_name is cleared when a name is rejected
 * by is_pir_name. 0 is returned when the first sequence is missing.
 * The statement run when a sequence is empty or does not end with '*' is not
 * visible in this listing - presumably it clears star_end. The final return
 * value is not visible either.
 */
1966 int format_is_pir ( char *file)
1969 int pir_name=1, star_end=1, a;
1971 S=get_fasta_sequence (file, NULL);
/* NOTE(review): S is not released on this early return */
1973 else if (!S->seq[0])return 0;
1975 pir_name=1; star_end=1;
1976 for (a=0; a< S->nseq; a++)
1979 if (!is_pir_name(S->name[a]))pir_name=0;
1980 l=strlen (S->seq[a]);
1981 if (!l || (l && S->seq[a][l-1]!='*'))
1984 free_sequence(S,-1);
1985 if ( pir_name && star_end) return 1;
/*
 * is_pir_name
 * Returns 1 when `name` contains one of the PIR two-letter type codes
 * followed by ';' ("P1;", "F1;", "DL;", "DC;", "RL;", "RC;", "XX;"),
 * 0 otherwise. The code may appear anywhere in the string.
 * A NULL name is reported as "not a PIR name" instead of being handed to
 * strstr.
 */
int is_pir_name (char *name)
{
  static const char *const pir_codes[]={"P1;", "F1;", "DL;", "DC;", "RL;", "RC;", "XX;"};
  const int n_codes=(int)(sizeof (pir_codes)/sizeof (pir_codes[0]));
  int c;

  if ( name==NULL)return 0;

  for (c=0; c<n_codes; c++)
    {
      if ( strstr (name, pir_codes[c]))return 1;
    }
  return 0;
}
/* Returns 1 when `file` contains the "CONC_MSF_FORMAT_01" marker, 0 otherwise.
   The stream returned by the search is closed before returning. */
int format_is_conc_aln (char *file)
{
  FILE *hit;

  hit=find_token_in_file (file, NULL, "CONC_MSF_FORMAT_01");
  if (!hit)
    {
      return 0;
    }
  vfclose (hit);
  return 1;
}
/*
 * format_is_saga
 * Tests whether `file` is a SAGA/ClustalW style interleaved alignment.
 * 1 is returned as soon as one of the header keywords below is found in the
 * file. Without a keyword the block pattern of the file is analysed: 0 is
 * returned when there are at most 2 blocks, when an inner block does not
 * hold n_seq entries, or when an entry of an inner block differs from 2.
 * How n_seq is obtained and the final return value are not visible in this
 * listing.
 */
2007 int format_is_saga ( char *file)
2015 if ( (fp=find_token_in_file (file, NULL, "SAGA"))){vfclose (fp); return 1;}
2016 else if ((fp=find_token_in_file (file, NULL, "CLUSTAL"))){vfclose (fp); return 1;}
2017 else if ((fp=find_token_in_file (file, NULL, "ClustalW"))){vfclose (fp); return 1;}
2018 else if ((fp=find_token_in_file (file, NULL, "clustalw"))){vfclose (fp); return 1;}
2019 else if ((fp=find_token_in_file (file, NULL, "clustal"))){vfclose (fp); return 1;}
2020 else if ((fp=find_token_in_file (file, NULL, "T-COFFEE_MSA"))){vfclose (fp); return 1;}
2021 else if ((fp=find_token_in_file (file, NULL, "INTERLEAVED_MSA"))){vfclose (fp); return 1;}
2026 else if ((fp=find_token_in_file (file, NULL, "T-COFFEE"))){vfclose (fp); return 1;}
2027 else if ((fp=find_token_in_file (file, NULL, "SAGA_FORMAT"))){vfclose (fp); return 1;}
2028 else if ((fp=find_token_in_file (file, NULL, "GARP"))){vfclose (fp); return 1;}
2029 else if ((fp=find_token_in_file (file, NULL, "INTERLEAVED"))){vfclose (fp); return 1;}
/* No keyword: fall back on the layout of the file */
2033 list=get_file_block_pattern (file,&n_blocks,100);
2034 if (n_blocks<=2){free_int (list, -1);return 0;}
/* Only the inner blocks (first and last excluded) are checked */
2038 for ( a=1; a< n_blocks-1; a++)
2040 if ( list[a][0]!=n_seq){free_int (list, -1);return 0;}
2043 for ( b=1; b<=list[a][0]; b++)
2044 if ( list[a][b]!=2){free_int (list, -1);return 0;}
/*
 * format_is_swissprot
 * Returns 1 when file `name` looks like a Swiss-Prot entry, 0 otherwise.
 * The file must exist, an "ID " line must be found by
 * find_token_in_file_nlines (called with a line count of 10), and an "SQ "
 * line must be found anywhere in the file.
 * Each search returns its own stream, so each one is closed as soon as its
 * result is known. The previous version stored both results in the same
 * variable and closed only the second stream.
 */
int format_is_swissprot (char *name)
{
  FILE *fp;

  if ( !check_file_exists(name))return 0;

  /* Entry header */
  fp=find_token_in_file_nlines (name,NULL,"\nID ",10);
  if ( fp==NULL)return 0;
  vfclose (fp);

  /* Sequence section header */
  fp=find_token_in_file (name,NULL,"\nSQ ");
  if ( fp==NULL)return 0;
  vfclose (fp);

  return 1;
}
2076 /*******************************************************************************************/
2081 /***************************************************************************************** */
/*
 * output_format_aln
 * Writes alignment inA, together with the companion alignment inEA, to file
 * `name` in the given format by wrapping copies of them in two
 * Sequence_data_struc and calling main_output.
 * The inputs are copied first, so the caller's alignments are not reordered.
 * The conditions selecting between the copy of inEA and expand_number_aln,
 * the assignments into D1 and D2, the clean-up and the return value are not
 * visible in this listing.
 */
2082 int output_format_aln ( char *format, Alignment *inA, Alignment *inEA,char *name)
2084 Sequence_data_struc *D1=NULL;
2085 Sequence_data_struc *D2=NULL;
2090 A =copy_aln (inA, NULL);
2092 EA=copy_aln (inEA,NULL);
2094 EA=expand_number_aln(inA,EA);
/* Restore the expanded order when one is recorded on the alignment */
2097 if (A && A->expanded_order )A=reorder_aln ( A, A->expanded_order,A->nseq);
2098 if (EA && EA->expanded_order)EA=reorder_aln ( EA, EA->expanded_order,EA->nseq);
2101 D1=vcalloc ( 1, sizeof (Sequence_data_struc));
2105 D2=vcalloc ( 1, sizeof (Sequence_data_struc));
/* D2 is passed in the third position of main_output (DST); the second one is left NULL */
2109 main_output ( D1, NULL,D2, format, name);
2117 int main_output (Sequence_data_struc *D1, Sequence_data_struc *D2, Sequence_data_struc *DST, char *out_format, char *out_file)
2125 if ( !out_format[0])return 0;
2126 if ( D1 && D1->rm_gap)ungap_aln ((D1->A));
2128 if ( (strstr (out_format, "expanded_")))
2131 out_format+=strlen ("expanded_");
2132 BUF_A=copy_aln (D1->A, NULL);
2133 (D1->A)=thread_profile_files2aln ((D1->A), NULL, NULL);
2137 if ( strm (out_format, ""))return 0;
2138 else if ( ( strm (out_format, "aln2lib")))
2142 Constraint_list *CL;
2150 pos=aln2pos_simple(IN, IN->nseq);
2151 fp=vfopen (out_file, "w");
2152 fp=save_list_header (fp,CL);
2155 for ( b=0; b< IN->nseq-1; b++)
2157 for ( c=b+1; c< IN->nseq; c++)
2161 fprintf ( fp, "#%d %d\n", s1+1, s2+1);
2162 for ( a=0; a< IN->len_aln; a++)
2167 if ( s1==s2 && !CL->do_self)continue;
2169 if ( s1< s2)s=(CL->evaluate_residue_pair)( CL, s1, r1, s2, r2);
2170 else s=(CL->evaluate_residue_pair)( CL, s2, r2, s1, r1);
2172 s=(s!=UNDEFINED)?s:0;
2175 fprintf (fp, "\t%5d %5d %5d \n", r1, r2, s);
2180 vfclose (save_list_footer (fp, CL));
2182 else if ( strncmp (out_format, "score",5)==0 || strm (out_format, "html"))
2189 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format or use +evaluate][FATAL:%s]\n", PROGRAM);
2190 myexit(EXIT_FAILURE);
2192 if ( !strm ("html", out_format))while ( out_format[0]!='_' && out_format[0]!='\0' )out_format++;
2194 D1->S=aln2seq(D1->A);
2195 BUF=copy_aln (DST->A, NULL);
2196 DST->A=aln2number (DST->A);
2198 if ( strstr ( out_format, "html" ))output_reliability_html ( D1->A, DST->A, out_file);
2199 else if( strm ( out_format, "_ps" ))output_reliability_ps ( D1->A, DST->A, out_file);
2200 else if( strm ( out_format, "_pdf" ))output_reliability_pdf ( D1->A, DST->A, out_file);
2201 else if( strm ( out_format, "_ascii" ))output_reliability_ascii ( D1->A, DST->A, out_file);
2202 else if( strm ( out_format, "_seq" ))output_seq_reliability_ascii ( D1->A, DST->A, out_file);
2206 main_output (DST, NULL, NULL, out_format+1, out_file);
2209 else if (strm (out_format, "sec_html") || strm (out_format, "_E_html"))
2221 ST=copy_aln (A, NULL);
2222 for (a=0; a<ST->nseq; a++)
2224 i=name_is_in_list (ST->name[a],S->name, S->nseq, 100);
2227 buf=seq2E_template_string(S, i);
2228 if ( buf==NULL)continue;
2230 for (c=0,b=0; b<ST->len_aln; b++)
2233 r1=ST->seq_al[a][b];
2238 else if (s=='h')r1='9';
2239 else if (s=='c')r1='5';
2242 ST->seq_al[a][b]=r1;
2249 printf_exit ( EXIT_FAILURE, stderr, "\nYou must provide a TM template file [FATAL:%s]", PROGRAM);
2251 output_color_html ( A, ST, out_file);
2253 else if (strm (out_format, "tm_html") || strm (out_format, "_T_html"))
2265 ST=copy_aln (A, NULL);
2266 for (a=0; a<ST->nseq; a++)
2268 i=name_is_in_list (ST->name[a],S->name, S->nseq, 100);
2271 buf=seq2T_template_string(S, i);
2272 if ( buf==NULL)continue;
2274 for (c=0,b=0; b<ST->len_aln; b++)
2277 r1=ST->seq_al[a][b];
2282 else if (s=='h')r1='9';
2283 else if (s=='i')r1='5';
2286 ST->seq_al[a][b]=r1;
2293 printf_exit ( EXIT_FAILURE, stderr, "\nYou must provide a TM template file [FATAL:%s]", PROGRAM);
2295 output_color_html ( A, ST, out_file);
2298 else if (strm (out_format, "color_exoset"))
2300 Alignment *ST, *EX, *A;
2301 Constraint_list *CL;
2307 printf_exit ( EXIT_FAILURE, stderr, "\nYou must provide an obj file via the -struc_in flag [FATAL:%s]", PROGRAM);
2312 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt"));
2314 ST=copy_aln (A, NULL);
2315 buf=vcalloc ( EX->len_aln+1, sizeof (int));
2317 for ( a=0; a< A->nseq; a++)
2321 i=name_is_in_list (A->name[a],EX->name, EX->nseq, -1);
2322 if ( i==-1)continue;
2324 sprintf ( buf, "%s", EX->seq_al[i]);
2327 for (n=0,b=0; b<A->len_aln; b++)
2329 if (!is_gap(A->seq_al[a][b]))
2332 ST->seq_al[a][b]='0';
2333 else if ( buf[n]=='j')
2334 ST->seq_al[a][b]='1';
2335 else if ( buf[n]=='b')
2336 ST->seq_al[a][b]='2';
2343 output_color_html ( A, ST, out_file);
2344 return EXIT_SUCCESS;
2347 else if (strm (out_format, "color_protogene"))
2350 DST->A=copy_aln (D1->A, NULL);
2351 for (n=1,a=0; a< (D1->A)->len_aln; a++, n++)
2353 for ( b=0; b<(D1->A)->nseq; b++)
2355 if (is_gap((D1->A)->seq_al[b][a]));
2356 else if ( n<=3)(DST->A)->seq_al[b][a]=2;
2357 else if ( n>3)(DST->A)->seq_al[b][a]=9;
2362 output_color_html ( D1->A, DST->A, out_file);
2363 return EXIT_SUCCESS;
2366 else if ( strncmp (out_format, "color",5)==0)
2374 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format or use +evaluate][FATAL:%s]\n", PROGRAM);
2375 myexit(EXIT_FAILURE);
2377 while ( out_format[0]!='_' && out_format[0]!='\0' )out_format++;
2379 BUF=copy_aln (DST->A, NULL);
2384 if ( strm ( out_format, "_html" ))output_color_html ( D1->A, DST->A, out_file);
2385 else if( strm ( out_format, "_ps" ))output_color_ps ( D1->A, DST->A, out_file);
2386 else if( strm ( out_format, "_pdf" ))output_color_pdf ( D1->A, DST->A, out_file);
2387 else if( strm ( out_format, "_ascii" ))output_color_ascii ( D1->A, DST->A, out_file);
2391 return main_output (DST, NULL, NULL, out_format+1, out_file);
2393 return EXIT_SUCCESS;
2395 else if ( strm4 ( out_format, "tc_aln","t_coffee_aln", "t_coffee", "tcoffee"))
2398 vfclose (output_aln ( D1->A, vfopen (out_file, "w")));
2400 else if ( strm ( out_format, "analyse_pdb"))
2405 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2406 myexit(EXIT_FAILURE);
2408 analyse_pdb ( D1->A,DST->A, "stdout");
2409 (DST->A)=aln2number (DST->A);
2410 output_reliability_ps ( D1->A, DST->A, out_file);
2412 else if ( strm4 ( out_format, "lower0", "lower1", "lower2", "lower3") || strm4(out_format, "lower4", "lower5", "lower6", "lower7") || strm4 (out_format,"lower8", "lower9", "align_pdb", "malign_pdb") )
2417 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2418 myexit(EXIT_FAILURE);
2423 (DST->A)=aln2number (DST->A);
2424 if ( strm (out_format, "align_pdb"))value=0;
2425 else if ( strm (out_format, "malign_pdb"))value=5;
2426 else value=atoi(out_format+5);
2428 D1->A=filter_aln_upper_lower (D1->A, DST->A,0, value);
2429 output_clustal_aln ( out_file, D1->A);
2431 else if ( strnm (out_format, "repeat", 6))
2438 size=atoi (out_format+6);
2440 CONC=declare_aln2 ( (D1->A)->nseq, ((D1->A)->len_aln+1)*size+1);
2442 for ( a=0; a< (D1->A)->nseq; a++)(D1->A)->seq_al[a][(D1->A)->len_aln]='\0';
2443 for ( c=0,a=0; a< (D1->A)->nseq;c++)
2446 sprintf ( CONC->name[c], "%s", (D1->A)->name[a]);
2447 for ( b=0; b<size; b++, a++)
2449 strcat (CONC->seq_al[c], (D1->A)->seq_al[a]);
2450 strcat (CONC->seq_al[c], "O");
2453 CONC->nseq=c;CONC->len_aln=strlen (CONC->seq_al[0]);
2454 output_clustal_aln ( out_file, CONC);
2458 else if ( strnm (out_format, "upper", 5))
2464 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2465 myexit(EXIT_FAILURE);
2469 (DST->A)=aln2number (DST->A);
2471 value=atoi(out_format+5);
2473 D1->A=filter_aln_lower_upper (D1->A, DST->A,0, value);
2474 output_clustal_aln ( out_file, D1->A);
2477 else if ( strm4 ( out_format, "filter0", "filter1", "filter2", "filter3"))
2482 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2483 myexit(EXIT_FAILURE);
2486 (DST->A)=aln2number (DST->A);
2488 D1->A=filter_aln (D1->A, DST->A, atoi(out_format+6));
2489 output_clustal_aln ( out_file, D1->A);
2492 else if ( strm3 ( out_format, "phylip_aln", "phylip", "phy"))
2495 output_phylip_aln ( out_file, D1->A);
2497 else if ( strm ( out_format, "mocca_aln"))
2500 output_mocca_aln ( out_file, D1->A, DST->A);
2502 else if ( strm ( out_format, "saga_pw_sd_weights") )
2505 output_pw_weights4saga ((D1->W),(D1->W)->PW_SD, out_file);
2507 else if ( strm ( out_format, "saga_aln"))
2510 output_saga_aln (out_file, D1->A);
2512 else if (strm2 ( out_format, "aln","clustal_tc")|| strm (out_format, "msa"))
2516 output_clustal_aln (out_file, D1->A);
2518 else if (strm5 ( out_format, "strict_clustal","clustal_aln", "clustalw","clustal", "clustalw_aln") || strm (out_format,"number_aln"))
2521 output_strict_clustal_aln (out_file, D1->A);
2523 else if ( strm ( out_format, "conc_aln"))
2526 output_conc_aln (out_file, D1->A);
2528 else if ( strm2 ( out_format, "lalign_aln","lalign"))
2531 output_lalign (out_file, D1->A);
2533 else if ( strm2 ( out_format, "glalign_aln","glalign"))
2536 output_glalign (out_file, D1->A, DST->A);
2539 else if ( strm2 ( out_format, "fasta_aln","fasta" ) || strm (out_format, "blast_aln"))
2542 output_fasta_aln( out_file, D1->A);
2544 else if ( strm (out_format, "overaln"))
2551 F=vcalloc (1, sizeof (OveralnP));
2553 string_array_upper ((D1->A)->seq_al, (D1->A)->nseq);
2556 D1->A=mark_exon_boundaries (D1->A, D2->A);
2559 else if ( (s=get_string_variable ("exon_boundaries")))
2563 EB=seq2aln(S=main_read_seq(s),NULL, 0);
2564 D1->A=mark_exon_boundaries (D1->A, EB);
2565 free_sequence (S, S->nseq); free_aln (EB);
2569 sprintf (F->mode, "%s", ((s=get_string_variable ("overaln_mode")))?s:"lower");
2570 if (!strm (F->mode, "lower") && !strm (F->mode, "unalign"))printf_exit (EXIT_FAILURE,stderr,"\nERROR: unknown overal_mode in overal output [%s] [FATAL:%s]", mode, PROGRAM);
2572 if (int_variable_isset ("overaln_threshold"))F->t=get_int_variable ("overaln_threshold");
2573 if (int_variable_isset ("overaln_target"))F->f=get_int_variable ("overaln_target");
2574 if (int_variable_isset ("overaln_P1"))F->p1=get_int_variable ("overaln_P1");
2575 if (int_variable_isset ("overaln_P2"))F->p2=get_int_variable ("overaln_P2");
2576 if (int_variable_isset ("overaln_P3"))F->p3=get_int_variable ("overaln_P3");
2577 if (int_variable_isset ("overaln_P4"))F->p4=get_int_variable ("overaln_P4");
2579 if (eb)sprintf (F->model, "fsa2");
2580 else sprintf (F->model, "fsa1");
2581 D1->A=aln2clean_pw_aln (D1->A, F);
2583 //if (eb)D1->A=aln2clean_pw_aln (D1->A, mode,t, f,p1,p2,p3, "fsa2");
2584 //else D1->A=aln2clean_pw_aln (D1->A, mode,t, f,p1,p2,p3, "fsa1");
2586 D1->S=aln2seq(D1->A);
2587 output_clustal_aln (out_file, D1->A);
2589 else if ( strm ( out_format, "est_prf" ))
2592 output_est_prf( out_file, D1->A);
2594 else if ( strm ( out_format, "clean_est_fasta_seq" ))
2597 D1->A=clean_est(D1->A);
2598 output_fasta_seq(out_file, D1->A);
2602 else if ( strm3 ( out_format, "msf_aln", "gcg", "msf"))
2605 output_msf_aln( out_file, D1->A);
2607 else if ( strm ( out_format, "rnalign"))
2610 output_rnalign (out_file, D1->A, DST->S);
2612 else if ( strm ( out_format, "fasta_seq") ||strm ( out_format, "list")||strm ( out_format, "file_list"))
2616 output_fasta_seq (out_file,D1->A);
2618 else if (strm (out_format, "fasta_tree") )
2621 output_fasta_tree (out_file,D1->A);
2624 else if ( strm ( out_format, "gotoh_seq"))
2627 output_gotoh_seq (out_file,D1->A);
2629 else if ( strm (out_format, "fasta_seq1"))
2632 output_fasta_seq1 (out_file, D1->A);
2634 else if ( strm2 (out_format, "pir_aln", "pir"))
2637 output_pir_aln (out_file, D1->A);
2639 else if ( strm (out_format, "pir_seq"))
2642 output_pir_seq (out_file, D1->A);
2644 else if ( strm (out_format, "gor_seq"))
2647 output_gor_seq (out_file, D1->A);
2649 else if ( strm (out_format, "pir_seq1"))
2652 output_pir_seq1 (out_file, D1->A);
2654 else if ( strm (out_format, "pw_lib_saga_aln"))
2657 output_pw_lib_saga_aln (out_file, D1->A);
2659 else if ( strm (out_format, "lib"))
2662 output_lib (out_file, D1->A);
2664 else if ( strm (out_format, "pdb_constraint_list"))
2667 output_constraints (out_file, "pdb",D1->A);
2669 else if ( strm2 (out_format, "constraint_list","tc_lib"))
2673 else if (!D1->CL)output_constraints (out_file,"sim", D1->A);
2674 else if (D1->CL) vfclose ( save_constraint_list ( D1->CL, 0, (D1->CL)->ne, out_file, NULL, "ascii",(D1->CL)->S));
2676 else if ( strm2 (out_format, "extended_lib","extended_cosmetic"))
2679 output_constraints (out_file,out_format, D1->A);
2681 else if ( strncmp (out_format, "extended_pair", 13)==0)
2684 output_constraints (out_file,out_format, D1->A);
2686 else if ( strm (out_format, "cache_id"))
2690 output_saga_aln (out_file, D1->A);
2692 else if ( strm (out_format, "compress_aln"))
2695 compress_aln (D1->A);
2696 output_saga_aln (out_file, D1->A);
2698 else if (strm (out_format, "n_seq") ||strm (out_format, "nseq") )
2701 fp=vfopen ( out_file, "w");
2702 fprintf ( fp, "%d\n", (D1->A)->nseq);
2706 else if ( strm ( out_format, "thread_dna_on_prot_aln"))
2709 D1->A=thread_dnaseq_on_prot_aln (D1->S, D2->A);
2710 output_saga_aln ( out_file, D1->A);
2712 else if ( strm ( out_format, "tdna_fasta_seq1"))
2714 D1->A=translate_dna_aln (D1->A,0);
2715 output_fasta_seq1 (out_file, D1->A);
2717 else if ( strm ( out_format, "tdna_aln"))
2719 D1->A=translate_dna_aln (D1->A,0);
2720 output_saga_aln ( out_file, D1->A);
2722 else if ( strm ( out_format, "cdna_fasta_seq1"))
2724 D1->A= gene2prot(D1->A);
2725 output_fasta_seq1 ( out_file, D1->A);
2727 else if ( strm ( out_format, "mutate_cdna_aln"))
2729 D1->A= mutate_cdna_aln ( D1->A);
2730 output_clustal_aln ( out_file, D1->A);
2732 else if ( strm ( out_format, "tdna_sp_aln"))
2736 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2737 myexit(EXIT_FAILURE);
2739 (DST->A)=aln2number (DST->A);
2740 D1->A=translate_splice_dna_aln (D1->A, DST->A);
2741 output_saga_aln ( out_file, D1->A);
2743 else if (out_format && out_format[0] && (strcmp ( out_format,"rna_graph_fasta")==0))
2746 sprintf ( (D1->A)->seq_al[0], "%s",(DST->S)->seq[0]);
2748 output_fasta_seq (out_file, DST->A);
2750 else if (strm ( out_format, "freq_mat"))
2753 output_freq_mat (out_file, D1->A);
2755 else if (strm ( out_format, "maln_pval"))
2757 output_maln_pval ( out_file, D1->A);
2759 else if ( strm ( out_format, "model_aln"))
2762 output_model_aln ( out_file, D1->A);
2764 else if (strncmp (out_format, "mult",4)==0)
2767 output_mult_fasta_seq ( out_file, D1->A, atoi(out_format+4));
2769 else if (strm (out_format, "conservation"))
2771 output_conservation_statistics (out_file, D1->A);
2773 else if (strm (out_format, "len"))
2776 output_statistics (out_file, D1->A, "nrl");
2778 else if ( strm (out_format, "name"))
2781 if ( D1->A)output_statistics (out_file, D1->A, "n");
2785 TS=tree2seq(D1->T, NULL);print_array_char (vfopen(out_file, "w"), TS->name, TS->nseq, "\n");
2788 else if ( strm (out_format, "code_name"))
2794 if ( D1->A){n=(D1->A)->nseq, nl=(D1->A)->name;}
2795 if ( D1->T){TS=tree2seq(D1->T, NULL);nl=TS->name;n=TS->nseq;}
2797 lfp=vfopen (out_file, "w");
2798 for ( num=0; num<n; num++)
2799 fprintf (lfp, "\n%s C%d", nl[num], num+1);
2800 fprintf (lfp, "\n");
2803 else if ( strm ( out_format, "seq2struc"))
2805 output_seq2struc (out_file, D1->A);
2807 else if ( strstr ( out_format, "pavie_age_channel"))
2809 output_n_pavie_age_channel ( D1->S,out_file, atoi((out_format+strlen ("pavie_age_channel"))));
2810 return EXIT_SUCCESS;
2812 else if ( strstr ( out_format, "age_matrix"))
2814 output_age_matrix (out_file, atoi((out_format+10)));
2816 else if ( strm ( out_format, "transitions"))
2818 output_transitions (out_file, D1->A);
2821 else if ( strncmp (out_format, "statistics",10)==0)
2825 output_statistics (out_file, D1->A,out_format+10);
2831 else if ( strm4 (out_format, "newick_tree","newick","binary","nh"))
2835 /*D1->T=unroot_tree(D1->T);*/
2836 vfclose (print_tree ((D1->T), out_format, vfopen ( out_file, "w")));
2838 else if ( strncmp (out_format, "sarsim", 6)==0)
2841 compare_sar_sequence (D1->S, (D2 &&D2->S)?D2->S:D1->S, atoi(out_format+6));
2842 return EXIT_SUCCESS;
2844 else if ( strncmp (out_format, "sim",3)==0)
2847 output_similarities (out_file, D1->A,out_format);
2850 else if ( strncmp (out_format, "cov",3)==0)
2853 output_similarities (out_file, D1->A,out_format);
2855 else if ( strm (out_format, "stockholm_aln"))
2857 output_stockholm_aln (out_file,D1->A, (D2)?D2->A:NULL);
2859 else if ( strm (out_format, "pair_sim"))
2863 fprintf ( stderr, "\n-output=pair_sim: provide aln1 via -in and aln2 via -in2 [FATAL:%s]\n", PROGRAM);
2864 myexit (EXIT_FAILURE);
2866 output_similarities_pw (out_file, D1->A,D2->A,out_format);
2868 else if ( strm (out_format, "matrix") || strm (out_format, "blast_matrix"))
2870 output_blast_mat (D1->M, out_file);
2875 fprintf ( stderr, "\n%s is an UNKNOWN OUTPUT FORMAT [FATAL:%s]\n",out_format, PROGRAM);
2876 myexit (EXIT_FAILURE);
2880 //Remove the expansion
/* is_in_format_list: tests whether `name` is one of the input format names
 * listed below. Every visible test is an exact, case-sensitive strcmp()
 * match that returns 1 on success.
 * The fall-through return for an unknown name is on a line not shown in this
 * excerpt (presumably 0 -- confirm).
 * `name` must be a non-NULL, NUL-terminated string: it is passed straight to
 * strcmp().
 */
2888 int is_in_format_list ( char *name)
2890 if ( strcmp ( name, "saga_aln")==0)return 1;
2891 if ( strcmp ( name, "number_aln")==0)return 1;
2892 if ( strcmp ( name, "clustal_aln")==0)return 1;
2893 if ( strcmp ( name, "fasta_aln")==0)return 1;
2894 if ( strcmp ( name, "number_fasta")==0)return 1;
2895 if ( strcmp ( name, "fasta_seq")==0)return 1;
2896 if ( strcmp ( name, "pdb")==0)return 1;
2897 if ( strcmp ( name, "msf_aln")==0)return 1;
2898 if ( strcmp ( name, "dali_aln")==0)return 1;
2899 if ( strcmp ( name, "dali_seq")==0)return 1;
2900 if ( strcmp ( name, "barton_list_tc")==0)return 1;
2901 if ( strcmp ( name, "est_prf")==0)return 1;
2903 if ( strcmp ( name, "gotoh_aln")==0)return 1;
2904 if ( strcmp ( name, "amps_aln")==0)return 1;
2905 if ( strcmp ( name, "pir_aln")==0)return 1;
2906 if ( strcmp ( name, "pir_seq")==0)return 1;
2907 if ( strcmp ( name, "est_fasta")==0)return 1;
2908 if ( strcmp ( name, "amps_sd_scores")==0)return 1;
2909 if ( strcmp ( name, "pima_aln")==0)return 1;
2910 if ( strcmp ( name, "dialign_aln")==0)return 1;
2911 if ( strcmp ( name, "gor_seq")==0)return 1;
2912 if ( strcmp ( name, "gor_struc")==0)return 1;
2913 if ( strcmp ( name, "stockholm_aln")==0)return 1;
/* is_struc_in_format_list: tests whether `name` is one of the structure
 * input format names listed below ("rna_number", "fasta_seq"), using exact
 * strcmp() matches that return 1.
 * The return for an unknown name is on a line not shown in this excerpt
 * (presumably 0 -- confirm).
 */
2917 int is_struc_in_format_list ( char *name)
2919 if ( strcmp ( name, "rna_number")==0)return 1;
2920 if ( strcmp ( name, "fasta_seq")==0)return 1;
/* format_name2aln_format_name: rewrites a format alias IN PLACE inside the
 * caller's buffer: "gcg" becomes "msf" and "fasta" becomes "fasta_aln".
 * Other names are left untouched by the visible lines.
 * NOTE(review): "fasta_aln" needs 10 bytes including the terminator, which is
 * longer than the "fasta" it replaces, so `name` must point to a writable
 * buffer of at least that size -- confirm at the call sites.
 * The return statement is not visible in this excerpt.
 */
2923 char *format_name2aln_format_name (char *name)
2925 if ( strm (name, "gcg"))sprintf (name, "msf");
2926 else if ( strm (name, "fasta"))sprintf (name, "fasta_aln");
/* is_out_format_list: forwards `name` to main_output() with NULL for every
 * data argument and for the output file, and returns whatever main_output()
 * returns.
 * NOTE(review): this relies on main_output() accepting NULL data/file
 * arguments as a "is this format known?" probe; that code path is not
 * visible in this excerpt -- confirm.
 */
2929 int is_out_format_list ( char *name)
2931 return main_output (NULL, NULL, NULL, name, NULL);
/* is_struc_out_format_list: same call as is_out_format_list(): forwards
 * `name` to main_output() with NULL data/file arguments and returns its
 * result.
 * NOTE(review): depends on main_output() tolerating NULL inputs -- confirm.
 */
2934 int is_struc_out_format_list ( char *name)
2936 return main_output (NULL, NULL, NULL, name, NULL);
2939 /**************************************************************************************************/
2940 /*************************************REFORMAT UTIL*************************************************/
2941 /**************************************************************************************************/
2943 /*************************************REFORMAT IN**************************************************/
2944 /**************************************************************************************************/
2945 /*******************************************************************************************/
2950 /***************************************************************************************** */
2952 /*******************************************************************************************/
2957 /***************************************************************************************** */
/* get_amps_sd_scores: reads an AMPS output file `fname` into a Weights
 * structure.
 *
 * The visible code opens the file twice:
 *   - pass 1 positions the stream with set_fp_id(fp, "Index") and loops while
 *     the next token starts with a digit; nseq is then used to size W
 *     (the counting statement itself is on a line not shown here);
 *   - pass 2 repeats the scan and stores the name following each '>' in
 *     W->seq_name[a], then reads records of 16 floats and copies field 9 into
 *     PW_ID and field 14 into PW_SD, symmetrically for (a,b) and (b,a).
 *
 * NOTE(review): `buf=fgets(...)` overwrites the only pointer to the
 * allocation with NULL at end of file, and the "%s" conversions into the
 * 1001-byte buffers and W->seq_name[a] carry no field width.
 */
2959 Weights* get_amps_sd_scores ( char *fname)
2970 buf=vcalloc ( 1001, sizeof (char));
2971 buf2=vcalloc ( 1001, sizeof (char));
// Pass 1: scan the "Index" section.
2973 fp=vfopen ( fname, "r");
2974 set_fp_id ( fp, "Index");
2975 buf=fgets ( buf, 1000, fp);
2976 fscanf ( fp, "%s", buf2);
// isalnum && !isalpha: true only when the token starts with a digit.
2979 while ( isalnum(buf2[0]) && !isalpha(buf2[0]))
2982 buf=fgets ( buf, 1000, fp);
2983 fscanf ( fp, "%s", buf2);
2987 W=declare_weights (nseq);
// Pass 2: re-open and collect the sequence names.
2989 fp=vfopen ( fname, "r");
2990 set_fp_id ( fp, "Index");
2991 buf=fgets ( buf, 1000, fp);
2992 fscanf ( fp, "%s", buf2);
2995 while ( isalnum(buf2[0]) && !isalpha(buf2[0]))
2997 fp=set_fp_after_char (fp, '>');
2998 fscanf ( fp, "%s",W->seq_name[a]);
2999 buf=fgets ( buf, 1000, fp);
3000 fscanf ( fp, "%s", buf2);
3003 buf=fgets ( buf, 1000, fp);
// Each score record is read as 16 floats; only fields 9 and 14 are kept.
3007 for ( e=0; e< 16; e++)
3009 c=fscanf ( fp, "%f", &array[e]);
3017 W->PW_ID[b][a]=W->PW_ID[a][b]=array[9];
3018 W->PW_SD[b][a]=W->PW_SD[a][b]=array[14];
3023 sprintf ( W->comments, "SD WEIGHTS GENERATED WITH THE PROGRAM AMPS IN PAIRWISE MODE");
/* read_seq_weight: builds a Weights structure for the `nseq` sequences named
 * in `name`, reading per-sequence weights from the file `seq_weight`.
 *
 * Visible behaviour:
 *   - the file must pass is_single_seq_weight_file(); otherwise an error is
 *     printed and the program exits through myexit(EXIT_FAILURE);
 *   - W->seq_name[] is filled from `name`, W->mode receives the file name;
 *   - lines starting with '*' or '#' and blank lines are ignored;
 *   - other lines must parse as "<name> <float>", malformed ones are skipped;
 *   - a weight is applied only when <name> is found in W->seq_name (the
 *     assignment itself is on lines not shown in this excerpt).
 *
 * Changes in this revision:
 *   - the fatal message tag was misspelled "[FATA:"; it now reads "[FATAL:"
 *     like every other fatal message in this file;
 *   - `sname` is sized like `line`: a "%s" token extracted from `line` can
 *     never be longer than `line` itself, so sscanf can no longer overflow
 *     it (it used to be MAXNAMES bytes).
 */
3028 Weights *read_seq_weight (char **name, int nseq, char* seq_weight)
3035 char line[LONG_STRING];
3036 char sname[LONG_STRING];
3039 /*Read sequence weights:
3046 weights must be between 0 and 1;
3048 sequences not in S do not get any weight
3049 sequences in S but not in file get a weight of 1
3051 if ( !is_single_seq_weight_file (seq_weight))
3053 fprintf ( stderr, "\nERROR: File %s is not in Format SINGLE_SEQ_WEIGHT_FORMAT_01 [FATAL:%s]", seq_weight,PROGRAM);
3054 myexit (EXIT_FAILURE);
3059 W=declare_weights(nseq);
3060 for ( a=0; a< nseq; a++)
3062 sprintf ( W->seq_name[a], "%s", name[a]);
3065 sprintf ( W->mode, "%s", seq_weight);
3066 fp=vfopen (seq_weight, "r");
3069 while ( fgets( line,LONG_STRING-1, fp))
3071 if ( line[0]=='*' ||line[0]=='#' || isblanc(line));
3074 if (sscanf(line, "%s %f", sname, &w)!=2)continue;
3075 if ( (p=name_is_in_list ( sname, W->seq_name, nseq, MAXNAMES-1))!=-1)
3087 /*******************************************************************************************/
3092 /***************************************************************************************** */
/* read_rename_file: loads a two-column rename table from `fname`.
 *
 * The table is allocated with one row per line of the file plus one, two
 * strings per row, MAXNAMES+1 bytes per string.
 *   code==CODE   : column 1 -> convert[n][0], column 2 -> convert[n][1]
 *   code==DECODE : the two columns are swapped.
 * Reading stops at the first line that does not yield two tokens.
 *
 * NOTE(review): the "%s" conversions have no field width, so a token longer
 * than MAXNAMES overflows its slot. The initialisation of `n` and the return
 * statement are on lines not shown in this excerpt.
 */
3094 char *** read_rename_file ( char *fname, int code)
3098 char ***convert=NULL;
3100 convert=declare_arrayN(3, sizeof (char),count_n_line_in_file(fname) +1,2,MAXNAMES+1);
3101 fp=vfopen (fname, "r");
3103 if ( code==CODE) while ( fscanf ( fp, "%s %s\n", convert[n][0], convert[n][1])==2)n++;
3104 else if (code==DECODE)while ( fscanf ( fp, "%s %s\n", convert[n][1], convert[n][0])==2)n++;
/* get_barton_list_tc_seq: splits a Barton-style list file `in_file` into
 * per-entry files and writes several index files in the current directory.
 *
 * Files created by the visible code:
 *   barton_seq_list_large : entries with more than 8 sequences
 *   barton_nseq0..8       : entries grouped by number of sequences
 *   make_dir              : shell commands (mkdir / mv) for each entry
 *   barton_length         : "<index>: <length[index]>" for 0..longest
 *
 * Entries are delimited by runs of '#'. The name is read first, then the
 * body is accumulated into a growable buffer and handed to
 * process_barton_entry(), which returns the number of sequences.
 *
 * NOTE(review): the '#'-scanning loops at original lines 3141-3149 have no
 * EOF test; whether an enclosing condition protects them is not visible in
 * this excerpt. `length` holds 1000 ints while the final loop runs up to
 * `longest` -- confirm that longest < 1000.
 */
3109 void get_barton_list_tc_seq ( char *in_file)
3111 FILE *fp, *fp_make, *fp_length, *fp_long;
3125 length=vcalloc ( 1000, sizeof(int));
3126 if ( buf==NULL)buf=vcalloc ( len_buf, sizeof (char));
3127 fp=vfopen (in_file, "r");
3128 fp_long=vfopen ( "barton_seq_list_large", "w");
3129 fp_make=vfopen ( "make_dir", "w");
3130 fp_length=vfopen ( "barton_length", "w");
// One list file per sequence count, for counts 0..8.
3131 for ( a=0; a< 9; a++)
3133 sprintf ( name, "barton_nseq%d",a);
3134 fp_small[a]=vfopen ( name, "w");
// Skip to the '#' run that precedes the entry name, then read the name.
3141 while ( (c=fgetc(fp))!='#');
3142 while ( (c=fgetc(fp))=='#');
3144 while ( (c=fgetc(fp))!='#')buf[a++]=c;
3147 sprintf ( name, "%s", buf);
3149 while ( (c=fgetc(fp))=='#');
// Accumulate the entry body, growing buf as needed.
3154 while ( (c=fgetc(fp))!='#' && c!=EOF)
3160 buf=vrealloc ( buf, len_buf*sizeof (char));
3167 nseq=process_barton_entry ( buf,name);
3169 longest=(longest<nseq)?nseq:longest;
3171 if ( nseq<=8) fprintf ( fp_small[nseq], "%s.pep\n", name);
3172 else fprintf ( fp_long, "%s.pep\n",name);
3173 fprintf ( fp_make, "mkdir %s\nmv %s.pep %s\nmv %s.check %s\n", name, name, name, name, name);
3183 for ( a=0; a< 9; a++)vfclose (fp_small[a]);
3185 for ( a=0; a<= longest; a++)fprintf ( fp_length, "%d: %d\n", a, length[a]);
3186 vfclose ( fp_length);
/* process_barton_entry: converts one Barton/PIR-style entry held in `buf`
 * into two files, "<name>.pep" (FASTA sequences) and "<name>.check"
 * (written by output_pir_check from the per-sequence comments).
 *
 * Visible flow:
 *   pass 1 - walks `buf`, skipping two header lines per record and scanning
 *            up to the terminating '*' to obtain min/max sequence lengths;
 *   pass 2 - for each of the nseq records reads the ">P1;<name>" line, the
 *            comment line, then the residues up to '*'; one visible branch
 *            stores the character lower-cased, the other stores any remaining
 *            printable character as-is;
 *   output - builds an Alignment, writes both files, frees LS and LA.
 * The return statement is not visible in this excerpt; the caller uses the
 * result as the number of sequences.
 *
 * Fix in this revision: the comment line was passed to sprintf() as the
 * FORMAT string. Any '%' in the input file was therefore interpreted as a
 * conversion specification (undefined behaviour / format-string attack).
 * It is now copied through an explicit "%s".
 */
3190 int process_barton_entry (char *buf, char *name)
3200 int min_len_seq=999999;
3207 sprintf ( fname, "%s.pep", name);
3208 sprintf ( com_name, "%s.check",name);
3210 if ( buf2==NULL)buf2=vcalloc ( 10000, sizeof (char));
// Pass 1: measure the sequences.
3212 while (buf[a]!='\0')
3216 a=get_string_line (a,2, buf, buf2);
3217 while ((c=buf[a++])!='*')
3218 if (isalnum (c)|| c=='.' || c=='-')
3220 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3221 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3225 if ( buf[a]!='\0')a++;
3229 LS=declare_sequence ( min_len_seq, max_len_seq, nseq);
// Pass 2: fill names, comments and residues.
3233 for (a=0, current=0; current< nseq; current++)
3235 a=get_string_line ( a, 1, buf, buf2);
3236 sscanf ( buf2, ">P1;%s", LS->name[current]);
3237 a=get_string_line ( a, 1, buf, buf2);
// buf2 is file data: never use it as the format string.
3240 sprintf ( LS->seq_comment[current], "%s", buf2);
3243 while ( (c=buf[a++])!='*')
3246 LS->seq[current][p++]=tolower (c);
3247 else if ( isgraph(c))
3248 LS->seq[current][p++]=(c);
3253 LA=declare_Alignment(LS);
3254 seq2aln ( LS, LA,rm_gap);
3255 output_fasta_seq (fname,LA);
3256 output_pir_check (com_name,LA->nseq, LA->seq_comment);
3257 free_Alignment ( LA);
3258 free_sequence ( LS, nseq);
/* read_rna_struc_number: reads an RNA pairing list from `fname` and builds a
 * Structure (pair list + stems) for the first sequence of alignment `A`.
 *
 * Visible flow:
 *   - SA is a one-sequence Sequence sized on A->len_aln, with len[0] copied
 *     from A->len[0];
 *   - the file header gives one character and the number of list entries
 *     (tot_list); each entry line is "<int> <int> <int> <char> <char> <float>"
 *     and only the three ints are stored, in ST->list[a][0..2];
 *   - consecutive entries sharing the same list[a][0] value form one stem;
 *     stem[k][0] is that value, stem[k][1] / stem[k][2] the first / last
 *     entry index;
 *   - SA->seq[0] is marked '-' at both paired positions of every entry, then
 *     '>' / '<' at the positions of the first and last entry of each stem.
 *
 * NOTE(review): ST->list and ST->stem are allocated with SA->len[0] rows (see
 * declare_rna_structure_num) while the loop bound tot_list comes from the
 * file; no visible check ties the two together. The return statement is not
 * shown in this excerpt.
 */
3266 Structure *read_rna_struc_number (Alignment *A,char *fname)
3276 SA=declare_sequence ( A->len_aln, A->len_aln, 1);
3277 SA->len[0]=A->len[0];
3278 for ( a=0; a< SA->len[0]; a++)
3280 ST=declare_rna_structure_num (SA);
3283 fp=vfopen ( fname, "r");
3284 fscanf ( fp, "%c\n%d\n",&x, &(ST)->tot_list);
3285 for ( a=0; a<(ST)->tot_list; a++)
3287 fscanf ( fp, "%d %d %d %c %c %f\n", &(ST)->list[a][0],&(ST)->list[a][1],&(ST)->list[a][2], &x, &y, &f);
3293 (ST)->stem[0][0]=(ST)->list[a][0];
// Same stem id as the current stem: nothing to do.
3296 else if ( (ST)->stem[(ST)->tot_stem][0]==(ST)->list[a][0]);
// New stem id: close the current stem at a-1 and open the next one at a.
3297 else if ( (ST)->stem[(ST)->tot_stem][0]!=(ST)->list[a][0])
3299 (ST)->stem[(ST)->tot_stem][2]=a-1;
3301 (ST)->stem[(ST)->tot_stem][0]=(ST)->list[a][0];
3302 (ST)->stem[(ST)->tot_stem][1]=a;
3305 SA->seq[0][(ST)->list[a][1]]='-';
3306 SA->seq[0][(ST)->list[a][2]]='-';
// Close the last stem.
3308 (ST)->stem[(ST)->tot_stem][2]=a-1;
3310 for ( a=0; a< (ST)->tot_stem; a++)
3313 first=(ST)->stem[a][1];
3314 last=(ST)->stem[a][2];
3315 SA->seq[0][(ST)->list[first][1]]='>';
3316 SA->seq[0][(ST)->list[first][2]]='<';
3317 SA->seq[0][(ST)->list[last][1]]='>';
3318 SA->seq[0][(ST)->list[last][2]]='<';
/* declare_rna_structure_num: allocates a zero-initialised Structure and its
 * two integer tables, `list` and `stem`, each with SA->len[0] rows of 3
 * columns. Any further initialisation and the return statement are on lines
 * not shown in this excerpt.
 */
3324 Structure * declare_rna_structure_num (Sequence *SA)
3327 ST=vcalloc ( 1, sizeof ( Structure));
3328 ST->list=declare_int ( SA->len[0], 3);
3329 ST->stem=declare_int ( SA->len[0], 3);
/* read_lib_list: returns the lines of file `name` as an array of strings,
 * leaving out every line that contains the tag "TC_LIB_LIST_FORMAT_01".
 * The number of kept lines is written to n[0].
 *
 * The kept entries are the pointers returned by file2lines(), not copies.
 * Iteration starts at index 1.
 * NOTE(review): presumably lines[0] holds the line count used to set `l`;
 * that assignment is on a line not shown here -- confirm.
 */
3332 char ** read_lib_list (char *name, int *n)
3339 lines=file2lines (name);
3342 list=vcalloc (l, sizeof (char*));
3343 for ( n[0]=0,a=1; a<l; a++,b++)
3344 if ( !strstr (lines[a], "TC_LIB_LIST_FORMAT_01"))list[n[0]++]=lines[a];
3349 /*******************************************************************************************/
3354 /***************************************************************************************** */
/* read_group: reads a group definition file and returns one string list per
 * group (layout described in the original comment just below).
 *
 * Visible flow: a line buffer is sized on the longest line of the file, the
 * result array on the number of lines plus one. For every line, the first
 * character is consumed with fgetc(), the rest is read with fgets(), and the
 * line is kept (via string2list) only when that first character was '>'.
 *
 * NOTE(review): `buf=fgets(...)` replaces buf with NULL at end of file; the
 * return statement and clean-up are on lines not shown in this excerpt.
 */
3355 char ***read_group ( char *file)
3357 /*Format: Fasta like, the name fo the group followed with the name of the sequences
3358 ><Group name> <First Seq> <second seq> ....
3359 Groups must NOT be overlaping
3360 list[group_index][0]="number of sequences"
3361 list[group_index][1]="group name"
3362 list[group_index][2...N]="sequence"
3372 l=measure_longest_line_in_file (file)+1;
3373 buf=vcalloc (l, sizeof (char));
3374 list=vcalloc ( count_n_line_in_file (file )+1, sizeof (char**));
3376 fp=vfopen (file, "r");
3379 while ((c=fgetc(fp))!=EOF)
3381 buf=fgets (buf,l-1, fp);
3382 if ( c=='>')list[a++]=string2list (buf);
/* Forward declaration: worker that extracts the sequence from one PDB field. */
3388 static Sequence* get_pdb_sequence_from_field (char *fname, char *field);
/* get_pdb_sequence: extracts the sequence of PDB file `fname`.
 * The SEQRES field is tried first; if that yields nothing the ATOM field is
 * tried and a warning is recorded. If both fail, a second warning is
 * recorded. The return statement is not visible in this excerpt.
 */
3389 Sequence* get_pdb_sequence (char *fname)
// SEQRES worked: nothing else to do (empty statement on purpose).
3394 if ( (S=get_pdb_sequence_from_field(fname, "SEQRES"))!=NULL);
3395 else if ( (S=get_pdb_sequence_from_field(fname, "ATOM"))!=NULL)
3397 add_warning (stderr,"Warning: Read Sequence from ATOM field in %s [%s:WARNING]", fname, PROGRAM);
3401 add_warning ( stderr, "\nWARNING: failed to extract sequence from %s [%s:WARNING]\n", fname, PROGRAM);
/* get_pdb_sequence_from_field: runs the external tool `extract_from_pdb` on
 * `fname` for the given `field` (the caller passes "SEQRES" or "ATOM"),
 * first chain only, FASTA output redirected to a temporary file, then parses
 * that file with get_fasta_sequence().
 *
 * Returns NULL when no sequence could be parsed. Otherwise S->name[0] is set
 * to the PDB id when get_pdb_id() returns one, S->file[0] to `fname`, and
 * min_len / max_len to the length of the single sequence. The free_sequence()
 * call at the end belongs to a branch whose condition is not visible in this
 * excerpt.
 *
 * NOTE(review): the command line is built with sprintf() into a LONG_STRING
 * buffer and the file name is wrapped in single quotes; a path that is very
 * long or contains a single quote is not handled by the visible code.
 * Setting DEBUG_EXTRACT_FROM_PDB in the environment echoes the command.
 */
3406 static Sequence* get_pdb_sequence_from_field (char *fname, char *field)
3414 command=vcalloc ( LONG_STRING, sizeof (char));
3415 tp_name=vtmpnam (NULL);
3417 sprintf ( command, "extract_from_pdb -seq_field %s -chain FIRST -infile \'%s\' -mode fasta > %s", field, check_file_exists(fname), tp_name);
3418 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_seq] %s\n", command);
3419 my_system ( command);
3422 S=get_fasta_sequence ( tp_name, NULL);
3423 if (S==NULL)return NULL;
3425 if ( (pdbid=get_pdb_id (fname))){sprintf ( S->name[0], "%s",pdbid);vfree (pdbid);}
3428 sprintf ( S->file[0], "%s", fname);
3429 S->max_len=S->min_len=S->len[0];
3432 free_sequence (S, -1);
/* get_pdb_file: loads the whole content of `fname` into a freshly allocated,
 * zero-initialised buffer sized on the character count of the file plus one,
 * so the result stays NUL-terminated. Closing the file and returning the
 * buffer happen on lines not shown in this excerpt.
 */
3442 char * get_pdb_file ( char *fname)
3450 file=vcalloc ( sizeof (char),count_n_char_in_file ( fname)+1);
3451 fp=vfopen ( fname, "r");
3452 while ( (c=fgetc(fp))!=EOF)file[a++]=c;
/* get_struc_gor: reads GOR secondary-structure predictions from `fname`
 * into a Sequence.
 *
 * Visible flow:
 *   pass 1 - scans the file, reads "<name> <len>" for each record and keeps
 *            the minimum and maximum length;
 *   pass 2 - for each record reads the name and length, skips the rest of
 *            the header line, then reads `len` prediction lines keeping only
 *            the second character field of each, terminates the string and
 *            skips forward to the next '!' marker.
 * The conditions that detect the start of a record and the return statement
 * are on lines not shown in this excerpt.
 *
 * Fix in this revision: the loop that skips to the end of the header line
 * had no EOF test, so a file truncated inside that line made it spin
 * forever (fgetc keeps returning EOF). It now stops at EOF, like the other
 * scanning loops of this function.
 */
3457 Sequence* get_struc_gor ( char *fname)
3459 int nseq, min_len, max_len;
// Pass 1: collect the length range.
3469 fp=vfopen ( fname, "r");
3471 while ( (c=fgetc(fp))!=EOF)
3477 fscanf ( fp, "%s %d", name, &len);
3478 if (min_len==-1)min_len=max_len=len;
3481 min_len=(len>min_len)?min_len:len;
3482 max_len=(len>max_len)?len:max_len;
3489 S=declare_sequence ( min_len, max_len+1,nseq);
// Pass 2: read names and predicted states.
3492 fp=vfopen (fname,"r");
3493 while ( (c=fgetc(fp))!=EOF)
3498 fscanf ( fp, "%s %d\n",S->name[S->nseq], &(S->len[S->nseq]));
3500 while ( (c=fgetc(fp))!='\n' && c!=EOF);
3502 for ( a=0; a<S->len[S->nseq]; a++)
3503 fscanf ( fp, " %*c %c %*f %*f %*f\n",&(S->seq[S->nseq][a]));
3505 S->seq[S->nseq][a]='\0';
3506 while ( (c=fgetc(fp))!='!' && c!=EOF);
/* get_sequence_dali: reads sequences from a DALI-style file `fname`.
 *
 * Visible flow:
 *   pass 1 - for each record reads the name, then scans characters up to the
 *            next digit (or EOF) to measure the sequence and update the
 *            min/max lengths;
 *   pass 2 - re-opens the file, reads each name with fscanf_seq_name(), then
 *            copies the residues up to the next digit: one visible branch
 *            stores the character lower-cased, two others store a '-'.
 * The branch conditions, the record loops and the return statement are on
 * lines not shown in this excerpt.
 *
 * NOTE(review): the vfopen()==NULL branch prints to stdout rather than
 * stderr, and pass 1 reads the name with an unbounded "%s".
 */
3516 Sequence* get_sequence_dali (char *fname)
3527 int min_len_seq=999999;
3530 if ((fp=vfopen (fname,"r"))==NULL)
3531 {printf ( "\nCOULDN'T OPEN %s",fname);
3532 myexit(EXIT_FAILURE);
// Pass 1: measure.
3540 fscanf (fp, "%s",name);
3541 while (!isdigit(c=fgetc(fp)) && c!=EOF)
3542 if (isalnum (c) || c=='.' || c=='-')
3544 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3545 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3554 LS=declare_sequence ( min_len_seq, max_len_seq+1,nseq);
// Pass 2: load.
3557 fp=vfopen (fname,"r");
3566 fscanf_seq_name (fp, LS->name[current]);
3568 while (!isdigit(c=fgetc(fp)) && c!=EOF)
3571 LS->seq[current][p++]=tolower (c);
3573 LS->seq[current][p++]='-';
3575 LS->seq[current][p++]='-';
3577 LS->seq[current][p]='\0';
3578 LS->len[current]=strlen ( LS->seq[current]);
/* get_dialign_sequence: reads sequences from a DIALIGN output file `fname`.
 *
 * Visible flow:
 *   pass 1 - for each record reads the name and the rest of its line, then
 *            scans characters until '>', EOF, a space or a tab, counting
 *            alphanumerics and gap symbols to get the min/max lengths;
 *   pass 2 - re-opens the file, reads each name with fscanf_seq_name(),
 *            strips one trailing separator from it, skips the rest of the
 *            header line and copies the residues; one visible branch stores
 *            the character lower-cased, the other stores any remaining
 *            printable character as-is.
 * The record loops and the return statement are on lines not shown in this
 * excerpt.
 *
 * Fixes in this revision:
 *   - the trailing-separator test compared against ',' twice; the second
 *     test is now ';', matching the FASTA readers in this file;
 *   - the test is guarded by l>0 so an empty name no longer reads
 *     name[-1];
 *   - a duplicated `c!=EOF` was removed from the residue loop condition
 *     (no behaviour change).
 */
3591 Sequence* get_dialign_sequence (char *fname)
3602 int min_len_seq=999999;
3606 buf=vcalloc ( 1000, sizeof (char));
3607 if ((fp=vfopen (fname,"r"))==NULL)
3608 {printf ( "\nCOULDN'T OPEN %s",fname);
3609 myexit(EXIT_FAILURE);
// Pass 1: measure.
3615 {fscanf (fp, "%s",name);
3617 buf=fgets ( buf, 1000, fp);
3618 while ((c=fgetc(fp))!='>' && c!=EOF && c!=' ' && c!='\t')
3619 if (isalnum (c)|| is_gap(c))
3621 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3622 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3631 LS=declare_sequence ( min_len_seq, max_len_seq, nseq);
// Pass 2: load.
3634 fp=vfopen (fname,"r");
3643 fscanf_seq_name (fp, LS->name[current]);
3644 l=strlen ( LS->name[current]);
3645 if ( l>0 && (LS->name[current][l-1]==','||LS->name[current][l-1]==';'))LS->name[current][l-1]='\0';
3646 buf=fgets ( buf, 1000, fp);
3648 while ((c=fgetc(fp))!='>' && c!=EOF && c!=' ' && c!='\t')
3650 LS->seq[current][p++]=tolower (c);
3651 else if ( isgraph(c))
3652 LS->seq[current][p++]=(c);
3653 LS->seq[current][p]='\0';
3654 LS->len[current]=strlen ( LS->seq[current]);
/* get_pima_sequence: reads sequences from a PIMA output file `fname`.
 *
 * A prefix is derived from the file name (copied into `prefix`, with a
 * search for "-" whose handling is on lines not shown here). The visible
 * code tests whether a record name starts with that prefix; what is done
 * with the outcome is not visible in this excerpt.
 *
 * Visible flow:
 *   pass 1 - reads each name, skips the rest of its line and scans up to the
 *            next '>' counting alphanumerics and gap symbols for the min/max
 *            lengths;
 *   pass 2 - re-opens the file, reads each name, cuts it at the first '.',
 *            strips one trailing separator, skips the header line and copies
 *            the residues; one visible branch stores the character
 *            lower-cased, the other stores any remaining printable character
 *            as-is.
 * The record loops and the return statement are on lines not shown in this
 * excerpt.
 *
 * Fixes in this revision:
 *   - the trailing-separator test compared against ',' twice; the second
 *     test is now ';', matching the FASTA readers in this file;
 *   - the test is guarded by l>0 so an empty name (for instance a name that
 *     started with '.') no longer reads name[-1].
 */
3665 Sequence* get_pima_sequence (char *fname)
3677 int min_len_seq=999999;
3678 int nseq=0, l=0, len=0;
3682 sprintf ( prefix, "%s",fname);
3684 buf=strstr(prefix, "-");
3686 len=strlen (prefix);
3690 buf=vcalloc ( 1000, sizeof (char));
3691 if ((fp=vfopen (fname,"r"))==NULL)
3692 {printf ( "\nCOULDN'T OPEN %s",fname);
3693 myexit(EXIT_FAILURE);
// Pass 1: measure.
3700 fscanf_seq_name (fp,name);
3701 if ( strlen(name)>=len && strncmp ( name, prefix, len)==0)
3708 buf=fgets ( buf, 1000, fp);
3709 while ((c=fgetc(fp))!='>' && c!=EOF)
3710 if (isalnum (c)|| is_gap(c))
3712 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3713 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3723 LS=declare_sequence ( min_len_seq, max_len_seq, nseq);
// Pass 2: load.
3726 fp=vfopen (fname,"r");
3734 fscanf_seq_name (fp,LS->name[current]);
3735 if ( strlen(LS->name[current])>=len && strncmp ( LS->name[current], prefix, len)==0)
3739 buf2=strstr (LS->name[current], ".");
3740 if ( buf2!=NULL) buf2[0]='\0';
3742 l=strlen ( LS->name[current]);
3743 if ( l>0 && (LS->name[current][l-1]==','||LS->name[current][l-1]==';'))LS->name[current][l-1]='\0';
3744 buf=fgets ( buf, 1000, fp);
3746 while ((c=fgetc(fp))!='>' && c!=EOF)
3748 LS->seq[current][p++]=tolower (c);
3749 else if ( isgraph(c))
3750 LS->seq[current][p++]=(c);
3751 LS->seq[current][p]='\0';
3752 LS->len[current]=strlen ( LS->seq[current]);
/* perl_reformat2fasta: converts `fname` to FASTA by running the external
 * program `perl_command` on it, with stdout redirected to a temporary file,
 * and returns the sequences parsed from that file by get_fasta_sequence().
 * A missing program is reported through check_program_is_installed() with
 * the IS_FATAL flag.
 *
 * NOTE(review): the command line is assembled with sprintf() into `command`
 * (declared on a line not shown here) and `fname` is inserted unquoted, so
 * paths with spaces or shell metacharacters are not handled.
 */
3764 Sequence* perl_reformat2fasta (char *perl_command, char *fname)
3769 file=vtmpnam (NULL);
3771 check_program_is_installed ( perl_command,"", perl_command,EMAIL,IS_FATAL);
3772 sprintf ( command, "%s %s > %s", perl_command, fname, file);
3773 my_system ( command);
3774 return get_fasta_sequence (file, NULL);
/* get_fasta_sequence_num: two-pass FASTA-style reader.
 *
 * Visible flow:
 *   - the number of records is the number of '>' characters in the file;
 *   - pass 1 reads each name, skips the rest of the header line and scans up
 *     to the next '>' counting alphanumerics and gap symbols to obtain the
 *     min/max sequence lengths;
 *   - pass 2 re-opens the file; for each record it reads the name, strips
 *     one trailing ',' or ';', passes it through translate_name(), stores
 *     the rest of the header line as the comment (at most COMMENT_SIZE-1
 *     characters), then copies the sequence characters.
 * The conditions selecting which characters are copied, the use of
 * `comment_out` and the return statement are on lines not shown in this
 * excerpt.
 *
 * NOTE(review): `name` is a 100-byte buffer filled by fscanf_seq_name(),
 * whose bound is not visible here; the name[l-1] test assumes a non-empty
 * name.
 */
3776 Sequence* get_fasta_sequence_num (char *fname, char *comment_out)
3798 buffer=vcalloc (1000, sizeof (char));
3799 name=vcalloc ( 100, sizeof (char));
3801 nseq=count_n_char_x_in_file(fname, '>');
3802 min_len_seq=max=count_n_char_in_file(fname);
3803 sub=vcalloc (max+1, sizeof (int));
// Pass 1: measure.
3805 fp=vfopen (fname,"r");
3813 fscanf_seq_name (fp,name);
3814 while ((c=fgetc(fp))!='\n' && c!=EOF);
3815 while ((c=fgetc(fp))!='>' && c!=EOF)
3816 if (isalnum (c)|| is_gap(c))
3818 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3819 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3828 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
// Pass 2: load names, comments and sequences.
3832 fp=vfopen (fname,"r");
3840 fscanf_seq_name (fp,LS->name[current]);
3841 l=strlen ( LS->name[current]);
3842 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
3843 LS->name[current]=translate_name ( LS->name[current]);
3845 while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1))LS->seq_comment[current][a++]=c;
3846 LS->seq_comment[current][a]='\0';
3850 while ((c=fgetc(fp))!='>' && c!=EOF)
3853 LS->seq[current][p++]=c;
3855 LS->seq[current][p++]=c;
3858 LS->seq[current][p]='\0';
3859 LS->len[current]=strlen ( LS->seq[current]);
/* get_tree_file_list: turns a list of file names (one per line of `fname`)
 * into a FASTA-style temporary file whose record names are the file names
 * and whose record bodies are the file contents, then parses it with
 * get_fasta_tree().
 *
 * The scan stops at the first entry that is NULL or whose first field starts
 * with white space. A file that cannot be loaded contributes an empty body.
 * NOTE(review): `tmp` is assigned on a line not shown in this excerpt, and
 * the closing of `fp` before the final read is not visible either --
 * confirm.
 */
3878 Sequence *get_tree_file_list ( char *fname)
3887 list=file2list (fname, "\n");
3888 fp=vfopen (tmp, "w");
3890 while (list[a] && !isspace(list[a][1][0]))
3893 s=file2string (list[a][1]);
3894 fprintf ( fp, ">%s\n%s\n", list[a][1], (s)?s:"");
3898 free_arrayN((void ***)list, 3);
3899 return get_fasta_tree (tmp, NULL);
/* get_file_list: turns a list of names (one per line of `fname`) into a
 * FASTA-style temporary file containing only ">name" header lines, then
 * parses it with get_fasta_sequence().
 *
 * The scan stops at the first entry that is NULL or whose first field starts
 * with white space.
 * NOTE(review): `tmp` is assigned on a line not shown in this excerpt, and
 * the closing of `fp` before the final read is not visible either --
 * confirm.
 */
3901 Sequence *get_file_list ( char *fname)
3910 list=file2list (fname, "\n");
3911 fp=vfopen (tmp, "w");
3913 while (list[a] && !isspace(list[a][1][0]))
3916 fprintf ( fp, ">%s\n", list[a][1]);
3920 free_arrayN((void ***)list, 3);
3921 return get_fasta_sequence (tmp, NULL);
/* get_fasta_tree: two-pass reader for a FASTA-style file whose record bodies
 * are trees rather than residues.
 *
 * Visible flow:
 *   - the number of records is the number of '>' characters in the file;
 *   - pass 1 reads each name, skips the rest of the header line and scans up
 *     to the next '>' to obtain the min/max body lengths (the counting
 *     statement is on a line not shown here);
 *   - pass 2 re-opens the file; for each record it reads the name, strips
 *     one trailing ',' or ';', passes it through translate_name(), stores
 *     the rest of the header line as the comment (at most COMMENT_SIZE-1
 *     characters), then copies the body characters up to the next '>'.
 * Any filter applied to the copied characters, the use of `comment_out` and
 * the return statement are on lines not shown in this excerpt.
 *
 * NOTE(review): `name` is a 100-byte buffer filled by fscanf_seq_name(),
 * whose bound is not visible here; the name[l-1] test assumes a non-empty
 * name.
 */
3923 Sequence*get_fasta_tree (char *fname, char *comment_out)
3945 buffer=vcalloc (1000, sizeof (char));
3946 name=vcalloc ( 100, sizeof (char));
3948 nseq=count_n_char_x_in_file(fname, '>');
3949 min_len_seq=max=count_n_char_in_file(fname);
3950 sub=vcalloc (max+1, sizeof (int));
// Pass 1: measure.
3952 fp=vfopen (fname,"r");
3960 fscanf_seq_name (fp,name);
3961 while ((c=fgetc(fp))!='\n' && c!=EOF);
3962 while ((c=fgetc(fp))!='>' && c!=EOF)
3965 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3966 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3975 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
// Pass 2: load names, comments and bodies.
3979 fp=vfopen (fname,"r");
3987 fscanf_seq_name (fp,LS->name[current]);
3988 l=strlen ( LS->name[current]);
3989 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
3990 LS->name[current]=translate_name ( LS->name[current]);
3992 while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1))LS->seq_comment[current][a++]=c;
3993 LS->seq_comment[current][a]='\0';
3997 while ((c=fgetc(fp))!='>' && c!=EOF)
3999 LS->seq[current][p++]=c;
4002 LS->seq[current][p]='\0';
4003 LS->len[current]=strlen ( LS->seq[current]);
/* get_fasta_sequence_raw: reads a FASTA file into a Sequence (LS), keeping every
 * character of the record body except newlines. The file is opened twice.
 * Pass 1: counts the '>' characters (number of records) and tracks the shortest and
 *         longest record length (clen) to size the declare_sequence allocation.
 * Pass 2: reads each name with fscanf_seq_name (a trailing ',' or ';' is removed,
 *         then translate_name is applied), stores the rest of the header line as the
 *         comment (at most COMMENT_SIZE-1 characters) and copies the body up to the
 *         next '>' or EOF.
 * NOTE(review): `name` holds 100 bytes -- confirm fscanf_seq_name can never write more.
 * NOTE(review): name[l-1] is read without checking l>0 -- confirm names are never empty. */
4023 Sequence* get_fasta_sequence_raw (char *fname, char *comment_out)
4045 buffer=vcalloc (1000, sizeof (char));
4046 name=vcalloc ( 100, sizeof (char));
4048 nseq=count_n_char_x_in_file(fname, '>');
4049 min_len_seq=max=count_n_char_in_file(fname);
4050 sub=vcalloc (max+1, sizeof (int));
4052 fp=vfopen (fname,"r");
4060 fscanf_seq_name (fp,name);
/* skip the remainder of the header line */
4061 while ((c=fgetc(fp))!='\n' && c!=EOF);
4062 while ((c=fgetc(fp))!='>' && c!=EOF)
4065 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4066 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4075 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4079 fp=vfopen (fname,"r");
4087 fscanf_seq_name (fp,LS->name[current]);
4088 l=strlen ( LS->name[current]);
4089 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
4090 LS->name[current]=translate_name ( LS->name[current]);
4092 while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1))LS->seq_comment[current][a++]=c;
4093 LS->seq_comment[current][a]='\0';
4097 while ((c=fgetc(fp))!='>' && c!=EOF)
/* "raw": only the line breaks are dropped, no other filtering is applied on this line */
4100 if (c!='\n')LS->seq[current][p++]=c;
4103 LS->seq[current][p]='\0';
4104 LS->len[current]=strlen ( LS->seq[current]);
/* get_fasta_sequence: reads a FASTA file into a Sequence (LS), optionally without
 * loading the residues in memory.
 * - disk is set to 1 when it is already 1, when get_int_variable("use_disk") is
 *   non-zero, or when the SEQ_ON_DISK_4_TCOFFEE environment variable exists.
 * - Pass 1 counts the records ('>' characters) and measures each record by looking at
 *   alphanumeric and gap characters; when `sub` equals "PDB" the length is taken from
 *   the sequence obtained through get_pdb_struc / get_pdb_sequence.
 * - Two allocations are visible: declare_sequence(min,max,nseq), and
 *   declare_sequence(0,0,nseq) followed by resetting every LS->seq[a] to NULL
 *   (presumably the on-disk case -- TODO confirm).
 * - Pass 2 fills names and comments while accumulating the counter `coor`;
 *   LS->dc[current][0] receives coor when p==0 and LS->dc[current][1] receives coor
 *   later on; residues are stored in LS->seq only when disk==0. A record whose stored
 *   text equals "PDB" is overwritten with the fetched PDB sequence, and a
 *   "Could not fetch PDB file" warning exists for the failure path.
 * NOTE(review): name[l-1] is read without checking l>0 -- confirm names are never empty. */
4123 Sequence* get_fasta_sequence (char *fname, char *comment_out)
4149 buffer=vcalloc (1000, sizeof (char));
4150 name=vcalloc ( 10000, sizeof (char));
4152 nseq=count_n_char_x_in_file(fname, '>');
4153 if (disk==1 || get_int_variable ("use_disk") || getenv ("SEQ_ON_DISK_4_TCOFFEE")){disk=1;}
4156 vfree (buffer); vfree (name);
4160 min_len_seq=max=count_n_char_in_file(fname);
4161 sub=vcalloc (max+1, sizeof (char));
4163 fp=vfopen (fname,"r");
4172 fscanf_seq_name (fp,name);
/* skip the remainder of the header line */
4173 while ((c=fgetc(fp))!='\n' && c!=EOF);
4174 while ((c=fgetc(fp))!='>' && c!=EOF)
4176 if (isalnum (c)|| is_gap(c))
/* a record body made of the literal text "PDB" stands for a structure to be fetched */
4180 if (strm (sub, "PDB"))
4182 pdb_name=get_pdb_struc(name,0, 0);
4183 pdb_S=get_pdb_sequence (pdb_name);
4186 clen=strlen( pdb_S->seq[0]);
4187 free_sequence ( pdb_S,1);
4194 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4195 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4207 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4210 LS=declare_sequence (0,0,nseq);
4211 for (a=0; a<nseq; a++)LS->seq[a]=NULL;
4215 fp=vfopen (fname,"r");
/* coor is advanced by the value returned by fscanf_seq_name and by each comment character */
4223 coor+=fscanf_seq_name (fp, LS->name[current]);
4226 l=strlen ( LS->name[current]);
4227 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
4228 LS->name[current]=translate_name ( LS->name[current]);
4230 while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1)){LS->seq_comment[current][a++]=c;coor++;}
4233 LS->seq_comment[current][a]='\0';
4236 while ((c=fgetc(fp))!='>' && c!=EOF)
4242 if (p==0)LS->dc[current][0]=coor;
4244 if (disk==0)LS->seq[current][p++]=c;
4248 LS->dc[current][1]=coor;
4252 if ( disk==0)LS->seq[current][p]='\0';
4254 if (LS->seq[current] && strm (LS->seq[current], "PDB"))
4257 pdb_name=get_pdb_struc(LS->name[current],0, 0);
4258 pdb_S=get_pdb_sequence (pdb_name);
4261 sprintf ( LS->seq[current], "%s", pdb_S->seq[0]);
4262 clen=strlen( pdb_S->seq[0]);
4263 free_sequence ( pdb_S, 1);
4267 add_warning (stderr, "WARNING: Could not fetch PDB file: %s", pdb_name);
4287 //LS=clean_sequence (LS);
/* get_sub_fasta_sequence: reads a FASTA-like file into a Sequence (LS); the file is
 * opened twice.
 * Pass 1: counts the '>' characters (number of records); for each record the header
 *         line is skipped, one further line is read into buf with fgets, and the
 *         remaining characters up to the next '>' or EOF are examined (alphanumeric
 *         or gap) to update min_len_seq / max_len_seq.
 * Pass 2: reads each name (a trailing ',' or ';' is removed, then translate_name is
 *         applied), skips the rest of the header line and stores the record
 *         characters, either lower-cased or unchanged.
 * NOTE(review): buf is overwritten with the fgets result, which is NULL at EOF --
 *               confirm this cannot happen before buf is used again.
 * NOTE(review): name[l-1] is read without checking l>0 -- confirm names are never empty. */
4292 Sequence* get_sub_fasta_sequence (char *fname, char *comment_out)
4313 nseq=count_n_char_x_in_file(fname, '>');
4314 min_len_seq=max=count_n_char_in_file(fname);
4315 sub=vcalloc (max+1, sizeof (int));
4316 buf=vcalloc ( max+1, sizeof (char));
4317 fp=vfopen (fname,"r");
4325 fscanf_seq_name (fp,name);
4326 while ((c=fgetc(fp))!='\n' && c!=EOF);
4327 buf=fgets ( buf,max, fp);
4328 while ((c=fgetc(fp))!='>' && c!=EOF)
4329 if (isalnum (c)|| is_gap(c))
4331 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4332 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4341 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4344 fp=vfopen (fname,"r");
4352 fscanf_seq_name (fp,LS->name[current]);
4353 l=strlen ( LS->name[current]);
4354 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
4355 LS->name[current]=translate_name ( LS->name[current]);
4356 while ((c=fgetc(fp))!='\n' && c!=EOF);
4359 while ((c=fgetc(fp))!='>' && c!=EOF)
4362 LS->seq[current][p++]=tolower (c);
4364 LS->seq[current][p++]=(c);
4367 LS->seq[current][p]='\0';
4368 LS->len[current]=strlen ( LS->seq[current]);
/* get_pir_sequence: reads a PIR-style file into a Sequence (LS); the file is opened twice.
 * Pass 1: for each record, an optional type tag (a header starting with 'P', skipped
 *         up to and including the next ';') is discarded, the name is read, one line
 *         is consumed with fgets, and the alphanumeric/gap characters up to the next
 *         '>' or EOF update min_len_seq / max_len_seq.
 * Pass 2: same header handling; the name loses a trailing ',' or ';' and goes through
 *         translate_name; the following line is stored as the record comment (its
 *         last character is dropped); residues are stored up to the next '>' or EOF,
 *         skipping white space and '*'.
 * When comment_out is not NULL the collected comments are written with output_pir_check.
 *
 * Fixes relative to the previous version:
 *  - the trailing-punctuation test compared against ',' twice; it now tests ',' and ';'
 *    like the FASTA readers of this file;
 *  - the loops skipping to ';' now also stop at EOF instead of spinning forever on a
 *    truncated header.
 * NOTE(review): the fgets result stored in LS->seq_comment[current] is NULL at EOF and
 *               is passed to strlen unchecked -- confirm the input always has that line.
 * NOTE(review): name[l-1] is read without checking l>0 -- confirm names are never empty. */
4385 Sequence* get_pir_sequence (char *fname, char *comment_out)
4397 int min_len_seq=999999;
4401 buf=vcalloc ( 1000, sizeof (char));
4402 if ((fp=vfopen (fname,"r"))==NULL)
4403 {printf ( "\nCOULDN'T OPEN %s",fname);
4404 myexit(EXIT_FAILURE);
4411 if ( (c=fgetc(fp))=='P')while ( (c=fgetc(fp))!=';' && c!=EOF);
4412 else ungetc ( c, fp);
4413 fscanf_seq_name (fp,name);
4415 buf=fgets ( buf, 1000, fp);
4416 while ((c=fgetc(fp))!='>' && c!=EOF)
4417 if (isalnum (c)|| is_gap(c))
4419 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4420 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4431 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4434 fp=vfopen (fname,"r");
4442 if ( (c=fgetc(fp))=='P')while ( (c=fgetc(fp))!=';' && c!=EOF);
4443 else ungetc ( c, fp);
4445 fscanf_seq_name (fp,LS->name[current]);
4447 l=strlen ( LS->name[current]);
4448 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
4449 LS->name[current]=translate_name ( LS->name[current]);
4450 buf=fgets ( buf, 1000, fp);
4452 LS->seq_comment[current]=fgets ( LS->seq_comment[current],COMMENT_SIZE-1, fp);
/* drop the last character of the comment line (the newline kept by fgets) */
4453 LS->seq_comment[current][strlen(LS->seq_comment[current])-1]='\0';
4455 while ((c=fgetc(fp))!='>' && c!=EOF)
4457 LS->seq[current][p++]=tolower (c);
4458 else if ( !isspace(c) && c!='*')
4459 LS->seq[current][p++]=(c);
4460 LS->seq[current][p]='\0';
4461 LS->len[current]=strlen ( LS->seq[current]);
4469 if (comment_out!=NULL) output_pir_check ( comment_out,LS->nseq, LS->seq_comment);
/* get_gor_sequence: reads a GOR-style file into a Sequence (LS); records are
 * delimited by '!' instead of '>'. The program exits when the file cannot be opened.
 * Pass 1: reads each name, consumes one line with fgets and examines the
 *         alphanumeric/gap characters up to the next '!' or EOF to update
 *         min_len_seq / max_len_seq.
 * Pass 2: reads each name (translate_name applied), consumes one line and stores the
 *         alphanumeric/gap characters of the record in lower case. */
4473 Sequence* get_gor_sequence (char *fname, char *comment_out)
4485 int min_len_seq=99999;
4489 buf=vcalloc ( 1000, sizeof (char));
4490 if ((fp=vfopen (fname,"r"))==NULL)
4491 {printf ( "\nCOULDN'T OPEN %s",fname);
4492 myexit(EXIT_FAILURE);
4499 fscanf_seq_name (fp,name);
4501 buf=fgets ( buf, 1000, fp);
4502 while ((c=fgetc(fp))!='!' && c!=EOF)
4503 if (isalnum (c)|| is_gap(c))
4505 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4506 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4515 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4518 fp=vfopen (fname,"r");
4528 fscanf_seq_name (fp,LS->name[current]);
4529 LS->name[current]=translate_name ( LS->name[current]);
4530 buf=fgets ( buf, 1000, fp);
4533 while ((c=fgetc(fp))!='!' && c!=EOF)
4534 if (isalnum (c)|| is_gap(c))
4535 LS->seq[current][p++]=tolower (c);
4537 LS->seq[current][p]='\0';
4538 LS->len[current]=strlen ( LS->seq[current]);
/* get_swissprot_sequence: reads a SwissProt-style flat file into a Sequence (LS).
 * The program exits when fname does not exist.
 * Pass 1: every "\nSQ" token marks a record; the rest of that line is skipped and
 *         the alphabetic characters up to the next '/' are counted to obtain the
 *         record length, from which min_len_seq / max_len_seq are derived.
 * Pass 2: for every "\nID" token the name is read with fscanf_seq_name, the matching
 *         "\nSQ" token is located, its line is skipped and the alphabetic characters
 *         up to the next '/' are appended to LS->seq[LS->nseq].
 *
 * Fix relative to the previous version: the two loops reading up to '/' now also stop
 * at EOF, so a truncated file no longer loops forever.
 * NOTE(review): the "\nSQ" lookup in pass 2 is not checked for NULL before fgets --
 *               confirm every ID entry is followed by an SQ entry. */
4549 Sequence* get_swissprot_sequence (char *fname, char *comment_out)
4556 int len, max_len_seq=0, min_len_seq=0;
4558 if ( !check_file_exists(fname))
4559 {printf ( "\nCOULDN'T OPEN %s",fname);
4560 myexit(EXIT_FAILURE);
4563 buf=vcalloc (LONG_STRING+1, sizeof (char));
4565 while ( (fp=find_token_in_file(fname,fp,"\nSQ")))
4568 fgets (buf, LONG_STRING, fp);
4570 while ((c=fgetc(fp))!='/' && c!=EOF)if(isalpha(c))len++;
/* the first record initialises both bounds */
4571 if ( max_len_seq==0)max_len_seq=min_len_seq=len;
4574 max_len_seq=MAX(len, max_len_seq);
4575 min_len_seq=MIN(len, min_len_seq);
4579 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4583 while ( (fp=find_token_in_file(fname,fp,"\nID")))
4585 fscanf_seq_name (fp, LS->name[LS->nseq]);
4586 fp=find_token_in_file(fname,fp,"\nSQ");
4587 fgets (buf, LONG_STRING, fp);
4588 while ((c=fgetc(fp))!='/' && c!=EOF)if (isalpha(c))LS->seq[LS->nseq][LS->len[LS->nseq]++]=c;
4589 LS->seq[LS->nseq][LS->len[LS->nseq]]='\0';
/* fscanf_seq_name: reads the next white-space delimited token from fp and copies it
 * into sname as a sequence name.
 *   fp    : stream positioned in front of the name.
 *   sname : destination buffer; when NULL the function returns 0 without reading.
 * The token is read into a lazily allocated 10000-byte scratch buffer (`name`). A
 * warning is emitted when the token is longer than MAXNAMES, and the scratch buffer
 * is cut at MAXNAMES characters before being copied to sname.
 * Return value: 0 when sname is NULL; the other return statement is not part of the
 * lines shown here (callers add the result to a file-offset counter -- TODO confirm
 * it is the number of characters read).
 *
 * Fix relative to the previous version: the conversion is bounded to 9999 characters
 * so that an over-long token can no longer overflow the 10000-byte scratch buffer. */
4596 int fscanf_seq_name ( FILE *fp, char *sname)
4600 if ( !sname) return 0;
4602 if ( !name)name=vcalloc ( 10000, sizeof (char));
4603 fscanf (fp, "%9999s", name);
4605 if ( strlen (name)>MAXNAMES)
4606 add_warning (stderr, "\nWARNING: Seq Name Too long: [%s]. Truncated to %d", name, MAXNAMES);
4607 name[MAXNAMES]='\0';
4608 sprintf ( sname, "%s", name);
4612 /*******************************************************************************************/
4617 /***************************************************************************************** */
/* undump_msa: reloads aligned sequences into A from a dump file made of
 * "<index> <sequence>" lines.
 * Does nothing when A or tmp is NULL or when the file does not exist.
 * The alignment is reallocated so that each row can hold the longest line of the
 * file, then every (index, string) pair read is copied into A->seq_al[index].
 * NOTE(review): index is not checked against the number of rows of A. */
4618 void undump_msa ( Alignment *A, char *tmp)
4625 if ( !A || !tmp || !check_file_exists (tmp))return;
4626 m=measure_longest_line_in_file (tmp );
4627 A=realloc_aln2 ( A,A->max_n_seq,m+1);
4629 buf=vcalloc (m+1, sizeof (char));
4630 fp=vfopen (tmp, "r");
4631 while (fscanf (fp, "%d %s\n", &index, buf)==2)
4633 sprintf ( A->seq_al[index], "%s", buf);
/* dump_msa: writes a subset of the rows of A to `file`, one "<index> <sequence>"
 * line per row.
 *   nseq : number of rows to write.
 *   lseq : indices of the rows of A->seq_al to write. */
4638 void dump_msa ( char *file,Alignment *A, int nseq, int *lseq)
4642 fp=vfopen (file, "w");
4643 for (a=0; a<nseq; a++)
4644 fprintf ( fp, "%d %s\n", lseq[a], A->seq_al[lseq[a]]);
/* read_aln: loads an alignment file into A by converting it to FASTA first.
 * The external script clustalw_aln2fasta_aln.pl is run on file_name with its output
 * redirected to a temporary file; the program exits with an error message when the
 * command does not return EXIT_SUCCESS. The temporary file is read with
 * get_fasta_sequence and the sequences are turned into an alignment with seq2aln.
 * NOTE(review): the seq2aln result is assigned to the local parameter A only;
 *               presumably seq2aln fills A in place -- confirm. */
4648 void read_aln (char *file_name, Alignment *A)
4654 tmp_name=vtmpnam (NULL);
4655 if (printf_system ( "clustalw_aln2fasta_aln.pl %s > %s",file_name, tmp_name)!=EXIT_SUCCESS)
4657 printf_exit ( EXIT_FAILURE, stderr, "Could Not Read File %s [FATAL:%s]\n", file_name, PROGRAM);
4661 S=get_fasta_sequence ( tmp_name,NULL);
4662 A=seq2aln (S, A, 0);
/* read_stockholm_aln: loads an alignment file into A via a FASTA conversion.
 * The external script clustalw_aln2fasta_aln.pl converts file_name into a temporary
 * FASTA file (the program exits when the command fails), which is read with
 * get_fasta_sequence. In every name containing "_stockholm", the placeholders
 * "_stockholmspace_" and "_stockholmhasch_" are replaced by " " and "#" respectively,
 * then the sequences are turned into an alignment with seq2aln. */
4666 void read_stockholm_aln (char *file_name, Alignment *A)
4672 tmp_name=vtmpnam (NULL);
4673 if (printf_system ( "clustalw_aln2fasta_aln.pl %s > %s",file_name, tmp_name)!=EXIT_SUCCESS)
4675 printf_exit ( EXIT_FAILURE, stderr, "Could Not Read File %s [FATAL:%s]\n", file_name, PROGRAM);
4680 S=get_fasta_sequence ( tmp_name,NULL);
4681 for (a=0; a<S->nseq; a++)
4683 if (strstr (S->name[a], "_stockholm"))
4685 substitute ( S->name[a], "_stockholmspace_", " ");
4686 substitute ( S->name[a], "_stockholmhasch_", "#");
4689 A=seq2aln (S, A, 0);
/* read_blast_aln: loads a BLAST result file into A.
 * is_blast_file determines the flavour of the file; the program exits when it returns 0.
 *   BLAST_TXT : the file is piped through blast_aln2fasta_aln.pl and
 *               fasta_aln2fasta_aln_unique_name.pl.
 *   BLAST_XML : the file is converted with blast_xml2fasta_aln.pl.
 * In both cases the output goes to a temporary file that is read with main_read_aln.
 * NOTE(review): the exit status of the conversion commands is not tested here. */
4693 Alignment* read_blast_aln ( char *file_name, Alignment *A)
4700 if ( !(type=is_blast_file (file_name)))
4702 myexit (EXIT_FAILURE);
4704 tmp_name=vtmpnam ( NULL);
4705 if (type==BLAST_TXT)
4707 printf_system("cat %s | blast_aln2fasta_aln.pl | fasta_aln2fasta_aln_unique_name.pl >%s", file_name, tmp_name);
4709 else if (type==BLAST_XML)
4712 printf_system("blast_xml2fasta_aln.pl %s >%s", file_name, tmp_name);
4715 main_read_aln (tmp_name, A);
/* read_number_aln: reads a block-formatted alignment file into A.
 * Outline of the visible steps:
 *   0. file_name is copied character by character into a temporary file (fname),
 *      which is the file read by all later steps.
 *   1. first read of fname: header lines and blank lines are skipped with vfgets /
 *      isblanc, then the first block is walked line by line (this is where the
 *      number of sequences is counted).
 *   2. A is reallocated for nseq+1 rows of (file size / nseq)+1 characters.
 *   3. second read: non-blank header lines are stored in A->aln_comment, then one
 *      name per line is read with fscanf_seq_name. A duplicated name triggers a
 *      warning and, unless the ALLOW_DUPLICATE environment variable is set, a fatal exit.
 *   4. third read: each line starts with a name that is looked up in A->name; the
 *      residues of a recognised line are appended to the matching row (lower-cased
 *      unless A->residue_case==2), lines with unknown names are skipped.
 *   5. every row is NUL-terminated and A->len_aln is the length of the first row.
 * NOTE(review): several fgetc loops below compare against '\n' or use isalnum without
 *               testing EOF -- confirm the input always provides the expected characters. */
4720 void read_number_aln ( char *file_name, Alignment *A)
4737 fp=vfopen ( file_name, "r");
4739 fname=vtmpnam(NULL);
4740 fp2=vfopen ( fname, "w");
4741 while ( (c=fgetc(fp))!=EOF)
4743 fprintf ( fp2, "%c", c);
4749 /*1 Count The number of sequences*/
4750 fp=vfopen ( fname, "r");
4751 buf=vfgets ( buf,fp);
/* NOTE(review): this if has an empty body (trailing ';'), so it has no effect */
4752 if ( !isblanc (buf));
4753 while ( isblanc (buf))
4755 buf=vfgets ( buf, fp);
4757 while (!isblanc (buf))
4759 buf=vfgets ( buf,fp);
4761 while ( !isalnum ((c=fgetc(fp))))
4764 buf=vfgets ( buf,fp);
4767 if ( c!='\n')ungetc(c,fp);
4769 while ( isalnum ((c=fgetc(fp))))
4773 while ( isgraph ((c=fgetc(fp))));
4775 buf=vfgets ( buf, fp);
/* NOTE(review): divides by nseq -- confirm nseq cannot be 0 at this point */
4781 max_len=count_n_char_in_file(fname)/nseq;
4782 A=realloc_alignment2( A, nseq+1, max_len+1);
4787 fp=vfopen ( fname, "r");
4788 buf=vfgets ( buf, fp);
4789 if ( !isblanc (buf))sprintf (A->aln_comment[n_comment++], "%s", buf);
4790 while ( isblanc (buf))
4792 buf=vfgets ( buf,fp);
4794 while (!isblanc (buf))
4796 buf=vfgets ( buf, fp);
4797 sprintf ( A->aln_comment[n_comment++], "%s", buf);
4800 while ( !isalnum ((c=fgetc(fp))))
4803 buf=vfgets ( buf, fp);
4807 if ( c!='\n')ungetc(c,fp);
4809 while ( isalnum ((c=fgetc(fp))))
4813 fscanf_seq_name (fp, A->name[A->nseq]);
4815 if ( name_is_in_list (A->name[A->nseq], A->name, A->nseq, 100)!=-1)
4817 fprintf ( stderr, "\nWARNING (read_number_aln): Sequence %s Duplicated in File %s ", A->name[A->nseq], A->file[A->nseq]);
4818 if (!getenv("ALLOW_DUPLICATE"))
4820 fprintf ( stderr, " [FATAL:%s]\n", PROGRAM);
4821 myexit (EXIT_FAILURE);
4825 buf=vfgets ( buf,fp);
4832 if ((fp=vfopen ( fname, "r"))==NULL)
4833 printf ( "\nCOULDN'T READ %s", fname);
/* ptr_aln[i] is the write position in row i of A->seq_al */
4835 ptr_aln=vcalloc ( A->nseq, sizeof(int));
4838 while ( (c=fgetc(fp))!='\n');
4839 if ( (c=fgetc(fp))=='\n')
4842 while ( !isalnum(c=fgetc(fp)));
4847 while(tot< A->nseq && c!=EOF)
4850 while ( !isgraph (c=fgetc(fp)) && c!=EOF);
4851 if ( c!=EOF)ungetc(c, fp);
/* collect the printable characters of the leading token (the sequence name) into buf */
4852 while ( isgraph((buf[b++]=fgetc(fp))));
/* a stays at -1 when the token matches none of the known names */
4854 for ( a=-1,d=0; d< A->nseq; d++)
4855 if ( strcmp (A->name[d], buf)==0)
4860 if ( a==-1) while ( (c=fgetc(fp))!='\n' && c!=EOF);
4863 while ( (c=fgetc(fp))!='\n')
4865 if ( isgraph(c) || is_gap(c))
4867 c=(A->residue_case==2)?c:tolower(c);
4869 if (!isspace(c))A->seq_al[a][ptr_aln[a]++]=c;
4874 while ( !isalnum(c=getc(fp)) && c!=EOF);
4882 for ( a=0; a< A->nseq; a++)
4883 {A->seq_al[a][ptr_aln[a]]='\0';
4888 A->len_aln= strlen(A->seq_al[0]);
/* read_amps_aln: reads an AMPS-format alignment from in_file into A.
 * The sequence names and their count come from get_amps_seq_name. The file is then
 * opened, positioned with set_fp_id on the "1*" marker, and the rest of that line is
 * skipped. In the lines shown, a blank read for a sequence is stored as the gap
 * character '-' and every row of A->seq_al is NUL-terminated at index b.
 * NOTE(review): the loops skipping to '\n' do not test EOF. */
4895 void read_amps_aln ( char *in_file, Alignment *A)
4898 int a, b, c, cont=1;
4899 A->nseq=get_amps_seq_name ( A->name, in_file);
4901 fp=vfopen ( in_file, "r");
4902 fp=set_fp_id(fp, "1*");
4903 while ( (c=fgetc(fp))!='\n');
4912 for ( a=0; a<A->nseq; a++)
4913 A->seq_al[a][b]='\0';
4920 for ( a=0; a< A->nseq; a++)
4923 if ( c==' ')A->seq_al[a][b]='-';
4930 while ((c=fgetc(fp))!='\n');
/* get_amps_seq_name: extracts the sequence names of an AMPS-format file.
 *   name  : array of buffers receiving the names, filled in file order.
 *   fname : file to read.
 * The file is positioned with set_fp_id on the "Index" marker and the rest of that
 * line is skipped. Then, as long as the next character read is white space, an entry
 * of the form "<number> ><name>" is parsed (the number is discarded) and the rest of
 * the line is skipped. nseq counts the names stored.
 * NOTE(review): the loops skipping to '\n' do not test EOF, and %s is unbounded. */
4941 int get_amps_seq_name ( char **name, char* fname)
4946 fp=vfopen ( fname, "r");
4947 fp=set_fp_id ( fp, "Index");
4948 while ( (fgetc(fp))!='\n');
4949 while ( isspace(fgetc(fp)))
4950 {fscanf (fp, "%*d >%s", name[nseq++]);
4951 while ( (fgetc(fp))!='\n');
/* read_gotoh_aln: reads a Gotoh-format alignment file into A.
 * Returns NULL when fname does not exist. The file is opened three times:
 *   1. to count the sequences: leading blank lines, a header paragraph and further
 *      blank lines are skipped, then each non-blank line of the first block is parsed
 *      as "<int> <string> <string> [<string>]"; duplicated names are reported and are
 *      fatal unless the ALLOW_DUPLICATE environment variable is set;
 *   2. after reallocating A for nseq+1 rows of (file size / nseq)+1 characters, to
 *      store the names (the 4th field replaces the 3rd one when a line has four fields);
 *   3. to read the residues: for each line the name field selects the row of A and
 *      the printable characters of the 2nd field are appended to it (a trailing '|'
 *      is ignored; characters are lower-cased unless A->residue_case==2).
 * Finally every row is NUL-terminated, A->len_aln is the length of the first row and
 * A->len[a] accumulates the number of non-gap characters of each row.
 *
 * Fix relative to the previous version: the duplicate-name warning named another
 * function (get_amps_seq_name); it now names read_gotoh_aln.
 * NOTE(review): fgets results are passed to isblanc before being tested for NULL.
 * NOTE(review): max_len divides by nseq -- confirm nseq cannot be 0. */
4956 Alignment * read_gotoh_aln ( char *fname, Alignment *A)
4964 char buf2[VERY_LONG_STRING+1];
4965 char buf3[VERY_LONG_STRING+1];
4966 char buf4[VERY_LONG_STRING+1];
4974 if ( !check_file_exists (fname))return NULL;
4975 fp=vfopen ( fname, "r");
4977 /*1 GET THE NUMBER OF SEQUENCES*/
4979 buf=vcalloc ( VERY_LONG_STRING+1, sizeof (char));
4980 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
4981 while (!isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
4982 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
4983 while ( !isblanc ( buf) && buf!=NULL)
4986 d=sscanf ( buf, "%d %s %s %s", &a, buf2, A->name[A->nseq],buf3);
4989 if ( name_is_in_list (A->name[A->nseq], A->name, A->nseq, 100)!=-1)
4991 fprintf ( stderr, "\nWARNING (read_gotoh_aln): Sequence %s Duplicated in File %s ", A->name[A->nseq], A->file[A->nseq]);
4992 if (!getenv("ALLOW_DUPLICATE"))
4994 fprintf ( stderr, " [FATAL:%s]\n", PROGRAM);
4995 myexit (EXIT_FAILURE);
4999 fgets(buf, VERY_LONG_STRING, fp);
5004 /*2 Get the MAX Len and Reallocate*/
5005 max_len=count_n_char_in_file(fname)/nseq;
5006 A=realloc_aln2( A, nseq+1, max_len+1);
5007 /*3 Get The Sequences Names*/
5009 fp=vfopen ( fname, "r");
5010 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5011 while (!isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5012 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5013 while ( !isblanc ( buf) && buf!=NULL)
5016 d=sscanf ( buf, "%d %s %s %s", &a, buf2, A->name[A->nseq],buf3);
/* with four fields on the line, the name is the 4th field rather than the 3rd */
5019 if ( d==4)sprintf (A->name[A->nseq],"%s", buf3);
5021 fgets(buf, VERY_LONG_STRING, fp);
5028 fp=vfopen ( fname, "r");
5030 buf=vcalloc ( VERY_LONG_STRING+1, sizeof (char));;
/* ptr_aln[i] is the write position in row i of A->seq_al */
5031 ptr_aln=vcalloc ( A->nseq, sizeof(int));
5033 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5034 while (!isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5037 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5045 e=sscanf (buf, "%d %s %s %s", &e, buf2, buf3, buf4);
5046 if ( e==4)sprintf( buf3, "%s", buf4);
5049 for ( d=0; d< A->nseq; d++)
5052 if ( strcmp (A->name[d], buf3)==0)
5058 if ( buf2[l-1]=='|')l--;
5063 if ( isgraph (buf2[b]))
5064 A->seq_al[a][ptr_aln[a]++]=(A->residue_case==2)?buf2[b]:tolower (buf2[b]);
5066 buf=fgets(buf, VERY_LONG_STRING, fp);
5070 buf=fgets(buf, VERY_LONG_STRING, fp);
5071 while ( isblanc (buf) && buf!=NULL)
5073 buf=fgets ( buf, VERY_LONG_STRING, fp);
5082 for ( a=0; a< A->nseq; a++)
5083 {A->seq_al[a][ptr_aln[a]]='\0';
5086 A->len_aln= strlen(A->seq_al[0]);
5090 for ( a=0; a< A->nseq; a++)
5092 for ( b=0; b< A->len_aln; b++)
5093 A->len[a]+=1-is_gap(A->seq_al[a][b]);
/* b counts the columns where neither of the first two rows has a gap */
5095 for ( a=0, b=0; a< A->len_aln; a++)
5097 if ( !is_gap(A->seq_al[0][a]) &&!is_gap(A->seq_al[1][a]))b++;
/* read_msf_aln: loads an MSF alignment file into A via a FASTA conversion.
 * The external script msf_aln2fasta_aln.pl converts fname into a temporary file; when
 * the command does not return EXIT_SUCCESS an error is printed and the program exits.
 * The temporary file is read with get_fasta_sequence and turned into an alignment
 * with seq2aln.
 * NOTE(review): the command line is built with an unbounded sprintf into `command`
 *               (its size is not visible here) -- confirm it cannot overflow. */
5106 void read_msf_aln ( char *fname, Alignment *A)
5112 tmp_name=vtmpnam(NULL);
5113 sprintf ( command, "msf_aln2fasta_aln.pl %s > %s", fname, tmp_name);
5115 if ( my_system (command)!=EXIT_SUCCESS)
5117 fprintf ( stderr, "\nERROR: file %s does not have a legal msf format [FATAL:%s]", fname,PROGRAM);
5118 myexit (EXIT_FAILURE);
5121 S=get_fasta_sequence ( tmp_name,NULL);
5122 A=seq2aln (S, A, 0);
5127 /**************************************************************************************************/
5128 /*************************************REFORMAT OUT*************************************************/
5129 /**************************************************************************************************/
5130 /*******************************************************************************************/
5133 /* OUTPUT MATRICES */
5135 /***************************************************************************************** */
/* output_freq_mat: writes the per-column residue counts of alignment A to outfile.
 * The counts come from aln2count_mat. One line is written per letter 'A'..'Z'
 * (rows 0..25 of the matrix) in the form "<letter> |<count> <count> ...", followed
 * by a "- |" line holding row 26. The count matrix is freed before returning. */
5139 int output_freq_mat ( char *outfile, Alignment *A)
5141 function documentation: start
5143 int output_freq_mat ( char *outfile, Aligmnent *A)
5145 This function counts the number of residues in each column of an alignment (Prot)
5146 It outputs these values in the following format
5152 This format can be piped into:
5153 The routine used for computing the p-value gmat-inf-gc-v2c
5155 function documentation: end
5163 freq_mat=aln2count_mat (A);
5165 fp=vfopen ( outfile, "w");
5166 for ( b=0; b< 26; b++)
5168 fprintf (fp, "%c |", 'A'+b);
5169 for ( a=0; a< A->len_aln; a++)fprintf (fp,"%d ", freq_mat[b][a]);
5172 fprintf (fp, "- |");
5173 for ( a=0; a< A->len_aln; a++)fprintf (fp,"%d ", freq_mat[26][a]);
5175 free_int (freq_mat, -1);
5179 /*******************************************************************************************/
5182 /* OUTPUT P-Values */
5184 /***************************************************************************************** */
/* output_maln_pval: computes the p-value of alignment A with the external program
 * gmat-inf-gc-v2c and writes it to outfile.
 * The frequency matrix of A is written with output_freq_mat to the file `mat`, which
 * is piped through "more" into gmat-inf-gc-v2c with the result redirected to a
 * temporary file. Returns 0 when that result file does not exist. Otherwise the token
 * following "ln(p-value):" is read, converted with atof and written to outfile with
 * six decimals.
 * NOTE(review): the result of find_token_in_file is used without a NULL check. */
5185 float output_maln_pval ( char *outfile, Alignment *A)
5188 function documentation: start
5189 float output_maln_pval ( char *outfile, Aligmnent *A)
5191 This function outputs the p-value of a multiple alignmnet as described
5192 in Hertz, Stormo, Bioinformatics, 15-7/8, 563/577
5193 ftp beagle.colorado.edu /pub/cosensus
5195 packages/consensus/gmat-inf-gc-v2c
5198 The routine used for computing the p-value is the program gmat-inf-gc-v2c
5199 function documentation: end
5207 char command[LONG_STRING];
5208 char string[STRING];
5210 result=vtmpnam (NULL);
5212 output_freq_mat (mat,A);
5213 sprintf ( command, "more %s | gmat-inf-gc-v2c -A abcdefghijklmnopqrstuvwxyz> %s",mat, result);
5214 my_system ( command);
5216 if ( !check_file_exists(result))return 0;
5217 fp=find_token_in_file ( result, NULL, "ln(p-value):");
5219 fscanf ( fp, "%s",string);
5220 value=atof ( string);
5226 fp=vfopen ( outfile, "w");
5227 fprintf ( fp, "%.6f\n", value);
5234 /*******************************************************************************************/
5237 /* OUTPUT WEIGHTS */
5239 /***************************************************************************************** */
/* output_seq_weights: writes the sequence weights held in W to wfile, one
 * "<name> <weight>" line per sequence (weight printed with two decimals).
 * Returns 0 when W is NULL or when the file cannot be opened. */
5240 int output_seq_weights ( Weights *W, char *wfile)
5245 if ( W==NULL)return 0;
5247 fp=vfopen (wfile, "w");
5248 if ( fp==NULL)return 0;
5251 for ( a=0; a< W->nseq; a++)
5254 fprintf ( fp, "%s %.2f\n", W->seq_name[a],W->SEQ_W[a]);
/* output_pw_weights4saga: writes pairwise weights to wfile.
 * Layout: the comment string of W, a "$" line, one "<name_a> <name_b> <weight>" line
 * for every pair a<b (weight taken from w_list[a][b]), and a closing "$" line. */
5259 void output_pw_weights4saga ( Weights *W, float **w_list, char *wfile)
5263 fp=vfopen (wfile, "w");
5265 fprintf ( fp, "%s\n$\n", W->comments);
5266 for ( a=0; a< W->nseq-1; a++)
5268 for (b=a+1; b< W->nseq; b++)
5270 fprintf ( fp, "%s %s %f\n", W->seq_name[a], W->seq_name[b],w_list[a][b]);
5273 fprintf ( fp, "$\n");
/* display_weights: prints a human-readable summary of the weights W on fp.
 * Two outputs are visible: an "UN-WEIGHTED MODE" banner, and a "WEIGHTED MODE:<mode>"
 * banner followed by one line per sequence with the name right-aligned to the
 * longest name and the weight printed with two decimals. The condition choosing
 * between them is not part of the lines shown. */
5277 FILE * display_weights (Weights *W, FILE *fp)
5284 fprintf ( fp, "\n\nUN-WEIGHTED MODE: EVERY SEQUENCE WEIGHTS 1\n");
5287 fprintf ( fp, "\n\nWEIGHTED MODE:%s\n\n", (W)->mode);
/* max_len: length of the longest sequence name, used as the column width */
5288 for ( a=0, max_len=0; a< W->nseq; a++)max_len=MAX(max_len, strlen (W->seq_name[a]));
5289 for ( a=0; a< (W->nseq); a++)
5291 fprintf ( fp, "\t%*s %.2f\n", max_len,(W)->seq_name[a],W->SEQ_W[a]);
5293 fprintf ( fp, "\n");
5297 /*******************************************************************************************/
5302 /***************************************************************************************** */
/* input_similarities: reads a pairwise similarity matrix from `file`.
 * The file is first tested with check_file_exists, is_distance_matrix_file and
 * is_similarity_matrix_file. When it is read, every "SEQ_INDEX" line is parsed as
 * "# SEQ_INDEX <name> <index>" and <name> is compared with A->name[index]; an
 * alignment can also be rebuilt from the file with similarities_file2aln.
 * The matrix is nseq x nseq with 100 on the diagonal; the "BOT"/"TOP" lines that
 * follow the "PW_SEQ_DISTANCES" marker supply the other cells, stored symmetrically
 * as integers. A is freed at the end when new_aln is set.
 * NOTE(review): the sscanf calls read from buf1 while the line returned by vfgets is
 *               held in buf2 -- confirm both point to the same buffer at that point. */
5303 int ** input_similarities (char *file, Alignment *A, char *mode)
5310 char *buf1=NULL, *buf2=NULL;
5315 if ( !check_file_exists (file) || !is_distance_matrix_file (file) ||!is_similarity_matrix_file (file) )
5322 fp=vfopen (file, "r");
5323 while ((buf2=vfgets (buf1,fp))!=NULL )
5325 if (strstr (buf2, "SEQ_INDEX"))
5328 sscanf (buf1, "# SEQ_INDEX %s %d",name, &i);
5329 if ( !strm (A->name[i], name))
5340 A=similarities_file2aln(file);
5344 sim=declare_int ( A->nseq, A->nseq);
5345 for ( a=0; a<A->nseq; a++)sim[a][a]=100;
5348 fp=find_token_in_file (file, NULL, "PW_SEQ_DISTANCES");
5349 fp=find_token_in_file (file, fp, "BOT");
5350 while ((buf2=vfgets (buf1,fp))!=NULL )
/* only the BOT and TOP records carry pairwise values */
5352 if ( !(strstr (buf2, "BOT\t") || strstr (buf2, "TOP\t")))continue;
5354 n=sscanf (buf1, "%*s %d %d %f", &a, &b, &score);
5360 else sim[a][b]=sim[b][a]=(int)score;
5364 if (new_aln)free_aln(A);
/* similarities_file2aln: builds an Alignment holding only the sequence names listed
 * in a similarity file.
 * A first scan counts the "SEQ_INDEX" tokens to size the alignment (nseq+1 rows);
 * a second scan reads "<name> <index>" after each token and stores the name at
 * A->name[index].
 * NOTE(review): index is not checked against the number of rows allocated. */
5368 Alignment * similarities_file2aln ( char *file)
5376 fp=vfopen (file, "r");
5377 while ((fp=find_token_in_file (file,fp, "SEQ_INDEX")))nseq++;
5378 A=declare_aln2 (nseq+1, 10);
5380 while ((fp=find_token_in_file (file,fp, "SEQ_INDEX")))
5382 fscanf (fp, "%s %d", name,&i);
5383 sprintf ( A->name[i], "%s", name);
/* output_similarities: writes the pairwise similarities of the sequences of A to
 * `file` in the TC_SIMILARITY_MATRIX_FORMAT_01 layout.
 * Layout: a format line, one "# SEQ_INDEX <name> <index>" line per sequence, a
 * "# PW_SEQ_DISTANCES" marker, then for every pair a<b a "BOT" line and the mirrored
 * "TOP" line, one "AVG" line per sequence and a final "TOT" line.
 * The measure used for a pair depends on substrings of `mode`, tested in this order:
 *   "_sarmat2", "_sar"         : get_sar_sim on the aligned rows;
 *   "_memory_"                 : a value is parsed from the mode string into `sim`;
 *   "_idscore" / "_covscore"   : idscore_pairseq with matrix "idmat" when the mode
 *                                contains "idscoreDNA", "blosum62mt" otherwise;
 *   "cov"                      : get_seq_sim with the "cov" option;
 *   anything else              : get_seq_fsim2.
 * NOTE(review): the "_memory_" branch scans a %ld into a pointer variable through a
 *               cast -- confirm pointer and long have the same size on all targets.
 * NOTE(review): the AVG lines divide by A->nseq-1 and TOT divides by n. */
5390 void output_similarities (char *file, Alignment *A, char *mode)
/* max: longest sequence name, used as the column width of the name fields */
5400 for (max=0, a=0; a< A->nseq; a++)max=MAX(max,(strlen (A->name[a])));
5403 tot=vcalloc ( A->nseq, sizeof (float));
5404 fp=vfopen (file, "w");
5405 fprintf (fp, "# TC_SIMILARITY_MATRIX_FORMAT_01\n");
5406 for ( a=0; a<A->nseq; a++)
5407 fprintf ( fp, "# SEQ_INDEX %s %d\n",A->name[a],a);
5408 fprintf ( fp, "# PW_SEQ_DISTANCES \n");
5409 for (n=0,a=0;a< A->nseq-1; a++)
5411 for ( b=a+1; b<A->nseq; b++, n++)
5413 if (strstr (mode, "_sarmat2"))
5415 s=get_sar_sim (A->seq_al[a], A->seq_al[b]);
5417 else if (strstr (mode, "_sar"))
5419 s=get_sar_sim (A->seq_al[a], A->seq_al[b]);
5421 else if ( (p=strstr (mode, "_memory_")))
5424 sscanf ( p, "_memory_%ld", (long int*)&sim);
5427 else if ( strstr (mode, "_idscore") || strstr ( mode, "_covscore"))
5432 free_sequence (S, -1);
5433 if ( strstr (mode, "idscoreDNA"))
5434 M=read_matrice ("idmat");
5436 M=read_matrice("blosum62mt");
5440 if ( strstr (mode, "_idscore"))s=idscore_pairseq(S->seq[a], S->seq[b], -10,-1, M, "sim");
5441 else s=idscore_pairseq(S->seq[a], S->seq[b], -10,-1, M, "cov");
5443 else if ( strstr (mode, "cov"))
5445 s=get_seq_sim ( A->seq_al[a], A->seq_al[b],GAP_LIST, "cov");
5449 s=get_seq_fsim2 (A->seq_al[a], A->seq_al[b],GAP_LIST, mode);
5451 fprintf (fp, "BOT\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", a,b,s,max,A->name[a], max, A->name[b], s);
5452 fprintf (fp, "TOP\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", b,a,s,max,A->name[b], max, A->name[a], s);
5458 for ( a=0; a< A->nseq; a++)
5460 fprintf (fp, "AVG\t %d\t %*s\t %*s\t %5.2f\n", a,max,A->name[a], max, "*", tot[a]/(A->nseq-1));
5463 vfree (tot);free_int (M, -1);
5464 fprintf (fp, "TOT\t %*s\t %*s\t %5.2f\n", max,"TOT", max, "*", bigtot/n);
/* output_similarities_pw: writes the similarities between every sequence of A and
 * every sequence of B to `file` in the TC_SIMILARITY_MATRIX_FORMAT_01 layout.
 * The substitution matrix is "idmat" when mode contains "idscoreDNA" and "blosum62mt"
 * otherwise. Each (a,b) pair is scored with idscore_pairseq ("sim" option) and
 * reported as a "BOT" line plus the mirrored "TOP" line; one "AVG" line per sequence
 * of A and a final "TOT" line follow.
 * NOTE(review): only the sequences of A get a "# SEQ_INDEX" line.
 * NOTE(review): the AVG lines divide by A->nseq-1 although each sequence of A is
 *               compared with B->nseq sequences -- confirm the intended denominator. */
5468 void output_similarities_pw (char *file, Alignment *A, Alignment *B,char *mode)
5480 if ( strstr (mode, "idscoreDNA"))
5481 M=read_matrice ("idmat");
5483 M=read_matrice("blosum62mt");
/* max: longest name over both alignments, used as the column width */
5488 for (max=0, a=0; a< A->nseq; a++)max=MAX(max,(strlen (A->name[a])));
5489 for (a=0; a< B->nseq; a++)max=MAX(max,(strlen (B->name[a])));
5492 tot=vcalloc ( A->nseq, sizeof (float));
5493 fp=vfopen (file, "w");
5494 fprintf (fp, "# TC_SIMILARITY_MATRIX_FORMAT_01\n");
5495 for ( a=0; a<A->nseq; a++)
5496 fprintf ( fp, "# SEQ_INDEX %s %d\n",A->name[a],a);
5497 fprintf ( fp, "# PW_SEQ_DISTANCES \n");
5498 for (n=0,a=0;a< A->nseq; a++)
5500 for ( b=0; b<B->nseq; b++, n++)
5502 s=idscore_pairseq(SA->seq[a], SB->seq[b], -10,-1, M, "sim");
5503 fprintf (fp, "BOT\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", a,b,s,max,A->name[a], max, B->name[b], s);
5504 fprintf (fp, "TOP\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", b,a,s,max,B->name[b], max, A->name[a], s);
5511 for ( a=0; a< A->nseq; a++)
5513 fprintf (fp, "AVG\t %d\t %*s\t %*s\t %5.2f\n", a,max,A->name[a], max, "*", tot[a]/(A->nseq-1));
5515 vfree (tot);free_int (M, -1);
5516 fprintf (fp, "TOT\t %*s\t %*s\t %5.2f\n", max,"TOT", max, "*", bigtot/n);
/* output_conservation_statistics: derives a substitution matrix from the alignment A
 * and prints it in BLAST matrix format.
 * Sequences are taken two at a time (a advances by 2); for every column where
 * neither of the two lower-cased residues is a gap, counts are accumulated in `tot`.
 * For each pair of letters of BLAST_AA_ALPHABET the printed value is
 * (int)(10*log(r)), with r = f1/(f2*f3), f1 = 2*tot[c1][c2]/tot[0][0],
 * f2 = tot[c1][0]/tot[0][0] and f3 = tot[c2][0]/tot[0][0]; r is 0 when f2 or f3 is 0,
 * and a zero r is printed as 0.
 * NOTE(review): all output goes to stdout; `file` is not used in the lines shown.
 * NOTE(review): tot[0][0] is used as a divisor without a zero check. */
5519 void output_conservation_statistics ( char *file, Alignment *A)
5526 sprintf (aa, "%s", BLAST_AA_ALPHABET);
5529 tot=declare_double (256, 256);
5532 for ( a=0; a<A->nseq; a+=2)
5535 for ( c=0; c<A->len_aln; c++)
5537 c1=tolower (A->seq_al[a][c]);
5538 c2=tolower (A->seq_al[b][c]);
5539 if ( !is_gap(c1) && !is_gap(c2))
5550 fprintf ( stdout, "# BLAST_MATRIX FORMAT\n#ALPHABET=%s\n",aa);
5551 for (a=0; a<naa; a++)fprintf ( stdout, "%3c ", toupper(aa[a]));
5552 fprintf ( stdout, "\n");
5553 for (a=0; a< naa; a++)
5555 fprintf (stdout, "%c", toupper(aa[a]));
5556 for ( b=0; b< naa; b++)
5558 float f1, f2, f3, r, v;
5559 c1=tolower(aa[a]);c2=tolower(aa[b]);
5560 f1=(float)((tot[c1][c2]*2)/tot[0][0]);
5561 f2=(float)((tot[c1][0])/tot[0][0]);
5562 f3=(float)((tot[c2][0])/tot[0][0]);
5563 r=(float)(f2==0 || f3==0)?0:(f1/(f2*f3));
5564 v=(r==0)?0:((float)10*log((double)r));
5565 fprintf (stdout, " %5d",(int)v);
5567 fprintf ( stdout, "\n");
/* output_statistics: writes simple statistics about alignment A to `file`.
 * `mode` is a string of one-letter selectors:
 *   header columns : 'n' -> "name", 'l' -> "nres", 'g' -> "ngap", 't' -> "len";
 *   global values  : 'N' -> "NSEQ <number of sequences>", 'L' -> "LEN <alignment length>".
 * A header line is written when at least one of "nlgt" is requested, then the global
 * values, then one line per sequence. The per-sequence values visible below are: the
 * name, the number of non-gap positions, the number of gap runs (a gap position
 * followed by a non-gap or by the end of the row), the row length, the number of
 * sequences and the alignment length; the selector guarding each of them is not part
 * of the lines shown.
 *
 * Fix relative to the previous version: strlen returns a size_t, which was passed to
 * a "%-5d" conversion; the value is now cast to int, as output_gor_seq already does. */
5570 void output_statistics (char *file, Alignment *A, char *mode)
5573 int a, b, c, d=0, n;
5577 if (!mode || !mode[0])
5579 else if ( mode[0]=='_')
/* maxname: longest sequence name, used as the width of the name column */
5581 for ( a=0; a<A->nseq; a++)maxname=MAX(strlen(A->name[a]), maxname);
5585 fp=vfopen (file, "w");
5590 while ((c=mode[b++])!='\0')
5592 if ( c=='n') fprintf (fp, "%-*s ",maxname,"name");
5593 if ( c=='l') fprintf (fp, "%-*s ",5,"nres");
5594 if ( c=='g') fprintf (fp, "%-*s ",5,"ngap");
5595 if ( c=='t') fprintf (fp, "%-*s ",5,"len");
5597 if (is_in_set ( c, "nlgt")) fprintf (fp, "\n");
5601 while ((c=mode[b++])!='\0')
5604 if ( c=='N'){d=1;fprintf (fp, "NSEQ %d ", A->nseq);}
5605 if ( c=='L'){d=1;fprintf (fp, "LEN %d ", A->len_aln);}
5607 if ( d) fprintf (fp, "\n");
5609 for (a=0; a<A->nseq; a++)
5613 while ((c=mode[b++])!='\0')
5615 if (is_in_set ( c, "nlgt"))d=1;
5617 if (c=='n'){d=1;fprintf ( fp, "%-*s ", maxname,A->name[a]);}
5620 for (n=0,d=0; d<A->len_aln; d++)n+=!is_gap(A->seq_al[a][d]);
5621 fprintf ( fp, "%-5d ",n);
/* a gap run is counted at its last position: the next character is a residue or the terminator */
5625 for (n=0,d=0; d<A->len_aln; d++)n+=((is_gap(A->seq_al[a][d]) && !is_gap(A->seq_al[a][d+1]))||(is_gap(A->seq_al[a][d])&& A->seq_al[a][d+1]=='\0')) ;
5626 fprintf ( fp, "%-5d ",n);
5630 fprintf ( fp, "%-5d ",(int)strlen (A->seq_al[a]));
5634 fprintf ( fp, "%-5d ",A->nseq);
5638 fprintf ( fp, "%-5d ",A->len_aln);
5641 if (d)fprintf ( fp, "\n");
/* output_age_matrix: writes a substitution matrix over the alphabet "abcdefghij-"
 * to outfile.
 * With v = val, or 1 when val is 0: a cell involving the gap symbol gets -v, any
 * other cell gets -|a-b|*v, where a and b are the positions of the two symbols in
 * the alphabet. The 256x256 matrix is indexed by character code, written with
 * output_mat and then freed. */
5646 int output_age_matrix ( char *outfile, int val)
5650 char alp[]="abcdefghij-";
5653 mat=declare_int ( 256, 256);
5655 for ( a=0; a<naa; a++)
5656 for ( b=0; b<naa; b++)
5658 if (is_gap(alp[a]) ||is_gap(alp[b] ))mat[(int)alp[a]][(int)alp[b]]=((val==0)?1:val)*-1;
5659 else mat[(int)alp[a]][(int)alp[b]]=(FABS((a-b))*-1)*((val==0)?1:val);
5662 output_mat ( mat,outfile, alp, 0);
5663 free_arrayN((void**)mat, 2);
/* output_transitions: computes log-odds scores for pairs of adjacent symbols in the
 * sequences of A and writes them to outfile.
 * Every row of A is ungapped and lower-cased in place. symbols[x] counts the
 * occurrences of character x and table[x][y] counts how often y directly follows x.
 * For each pair with non-zero counts:
 *     freq     = table[x][y] / tot
 *     expected = symbols[x]*symbols[y] / (tot*tot)
 *     log_odd  = log(freq/expected)
 * mat receives log_odd*10 and fmat receives log_odd; pairs whose two symbols occur
 * but never follow each other get -999. The matrix is written with output_mat, then
 * per-symbol and per-pair counts and frequencies are appended to outfile as "#" lines.
 * NOTE(review): mat and fmat are indexed with (code - 'A'); this is negative for any
 *               counted character below 'A' -- confirm such characters cannot occur.
 * NOTE(review): the appended frequencies divide by tot without a zero check. */
5670 int output_transitions(char *outfile, Alignment *A)
5672 double table[256][256];
5673 double symbols[256];
5674 double tot, l, freq, expected, log_odd;
5684 for ( a=0; a< 256; a++)
5685 for (b=0; b<256; b++)
5690 alp=vcalloc ( 256, sizeof (char));
5691 mat=declare_int ( 256,256);
5692 fmat=declare_float ( 256,256);
5694 for (tot=0,a=0; a< A->nseq; a++)
5696 ungap (A->seq_al[a]);
5697 lower_string (A->seq_al[a]);
5700 if ( s[0]=='\0') continue;
5701 symbols[(int)s[0]]++;
5702 for ( b=1; b< l; b++)
5704 symbols[(int)s[b]]++;
5705 table[(int)s[b-1]][(int)s[b]]++;
/* alp collects, in code order, every character that occurs at least once */
5709 for (naa=0, a=0; a< 256; a++)
5711 if (symbols[a])alp[naa++]=a;
5715 for ( a=0; a< 256; a++)
5716 for (b=0; b<256; b++)
5718 if (symbols[a]&& symbols[b] && table[a][b] && tot>0)
5720 freq=(table[a][b])/tot;
5721 expected=(symbols[a]*symbols[b])/(tot*tot);
5722 log_odd=log (freq/expected);
5723 mat[a-'A'][b-'A']=log_odd*10;
5724 fmat[a-'A'][b-'A']=log_odd;
5726 else if ( symbols[a]&& symbols[b])
5728 mat[a-'A'][b-'A']=-999;
5729 fmat[a-'A'][b-'A']=-999;
5732 output_mat ( mat,outfile, alp, 'A');
5734 fp=vfopen (outfile, "a");
5735 for ( a=0; a<256; a++)
5738 fprintf (fp, "# %c tot: %6d freq: %7.5f\n", a, (int)symbols[a],(float)symbols[a]/tot);
5741 for ( a=0; a< 256; a++)
5742 for (b=0; b<256; b++)
5744 if (symbols[a]&& symbols[b])
5746 freq=(table[a][b])/tot;
5747 fprintf (fp, "# %c%c tot: %6d freq: %7.5f log_odd: %9.3f\n", a, b, (int)table[a][b],(float)freq,fmat[a-'A'][b-'A']);
5752 free_arrayN ((void **)mat, 2);
5753 free_arrayN ((void **)fmat, 2);
/* output_est_prf: writes the nucleotide profile attached to A (A->P) to fname.
 * The program exits with "No profile" on the error path visible below. The output
 * starts with a two-line banner and a column header (A, G, C, T, Internal Gaps,
 * Consensus), followed by one line per alignment column holding
 * (A->P)->count[0..4][column] and the character of the first row of A at that column. */
5760 void output_est_prf (char *fname, Alignment *A)
5767 fprintf ( stderr, "\nFormat output_est_prf Impossible: No profile\n");
5768 myexit(EXIT_FAILURE);
5772 fp=vfopen ( fname, "w");
5773 fprintf ( fp, "Consensus Sequence\nReconstructed with %s (%s,%s)\n",PROGRAM,AUTHOR,DATE);
5774 fprintf ( fp, "%4c %4c %4c %4c %15s Consensus\n", 'A','G','C','T', "Internal Gaps");
5776 for ( a=0; a< A->len_aln; a++)
5778 fprintf (fp, "%4d %4d %4d %4d %15d %c\n", (A->P)->count[0][a],(A->P)->count[1][a],(A->P)->count[2][a], (A->P)->count[3][a], (A->P)->count[4][a],A->seq_al[0][a]);
/* output_gotoh_seq: writes the sequences of A to fname in Gotoh format.
 * The first line holds "<nseq> <max_len>". Each sequence is ungapped in place
 * (A is modified), then written as a ">name" line, the residues wrapped at 50
 * characters per line and a "//" terminator line. */
5784 void output_gotoh_seq (char *fname, Alignment*A )
5789 fp=vfopen ( fname, "w");
5790 fprintf ( fp, "%d %d\n",A->nseq, A->max_len);
5791 for ( a=0; a< A->nseq; a++)
5793 ungap ( A->seq_al[a]);
5794 fprintf ( fp, ">%s\n", A->name[a]);
5795 fp=output_string_wrap ( 50,A->seq_al[a] , fp);
5796 fprintf ( fp, "//\n");
/* output_mult_fasta_seq: writes the first sequence of A to fname as FASTA records
 * named "<name>_<a+1>". The sequence is ungapped in place first (A is modified).
 * The loop over a is not part of the lines shown; presumably it runs n times -- TODO confirm. */
5802 void output_mult_fasta_seq (char *fname, Alignment*A, int n )
5807 fp=vfopen (fname, "w");
5808 ungap(A->seq_al[0]);
5811 fprintf (fp, ">%s_%d\n%s\n", A->name[0],a+1, A->seq_al[0]);
/* output_fasta_seqX: writes sequence number i of S, or of A, to a file as one FASTA
 * record (">name comment" line followed by the residues).
 *   name : output file; a temporary name is generated when NULL.
 *   mode : fopen-style mode passed to vfopen.
 *   S, A : source of the record. Records taken from A are ungapped in place first.
 *   i    : index of the sequence to write.
 * The program exits with an error when neither S nor A is given or when i is out of
 * range for the source used.
 *
 * Fix relative to the previous version: the range test for A read S->nseq, which
 * dereferenced a NULL S whenever only A was supplied. The test now uses A->nseq when
 * S is absent; the behaviour is unchanged whenever S is supplied. */
5816 char * output_fasta_seqX (char *name, char *mode, Sequence *S, Alignment *A, int i)
5820 if (!name)name=vtmpnam (NULL);
5821 fp=vfopen (name, mode);
5822 if ( (S && S->nseq<=i) || (!S && A && A->nseq<=i) || (!A && !S))
5824 fprintf ( stderr, "\nERROR in function reformat:output_fasta_seqX[FATAL:%s]", PROGRAM);
5825 myexit (EXIT_FAILURE);
5829 fprintf ( fp, ">%s %s\n%s\n", S->name[i], S->seq_comment[i], S->seq[i]);
5832 ungap (A->seq_al[i]);
5833 fprintf ( fp, ">%s %s\n%s\n", A->name[i], A->seq_comment[i], A->seq_al[i]);
/* output_fasta_seq1: writes every sequence of A to its own FASTA file.
 * The file name is built from the sequence name: when fname starts with "name", the
 * text following "name" plus one separator character is used as the extension
 * ("seq" when there is none); the other visible naming rule is "<sequence name>.seq".
 * Each sequence is ungapped in place (A is modified) and written as a
 * ">name comment" line followed by the residues wrapped at 50 characters per line.
 * NOTE(review): seq_name is filled with an unbounded sprintf. */
5839 void output_fasta_seq1 (char *fname, Alignment*A )
5841 char seq_name[VERY_LONG_STRING];
5846 for ( a=0; a< A->nseq; a++)
5848 if ( strncmp( fname, "name",4)==0)
/* skip "name" and the character that follows it */
5850 if ( (fname+4)[0]!='\0')extension=fname+5;
5854 sprintf ( seq_name,"%s.%s", A->name[a],(extension==NULL)?"seq":extension);
5857 sprintf ( seq_name,"%s.seq",A->name[a]);
5859 ungap ( A->seq_al[a]);
5860 fp=vfopen (seq_name, "w");
5861 fprintf (fp, ">%s %s\n", A->name[a], A->seq_comment[a]);
5862 fp=output_string_wrap ( 50, A->seq_al[a],fp);
5863 fprintf ( fp, "\n");
/* output_pir_check: writes the nseq strings of `comment` to fname, one per line.
 * Does nothing when fname is NULL. */
5867 void output_pir_check (char *fname,int nseq, char **comment )
5872 if ( fname==NULL)return;
5873 fp=vfopen ( fname, "w");
5875 for ( a=0; a< nseq; a++)fprintf (fp, "%s\n", comment[a]);
/* output_fasta_seq: writes the sequences of A to fname in FASTA format by calling
 * main_output_fasta_seq with the HEADER option. */
5878 void output_fasta_seq (char *fname, Alignment*A)
5880 main_output_fasta_seq (fname, A, HEADER);
/* output_fasta_tree: writes every entry of A to fname as a FASTA-style record:
 * a ">name comment" line followed by the content of A->seq_al on a single line
 * (no ungapping and no line wrapping). Does nothing when A is NULL or empty. */
5882 void output_fasta_tree (char *fname, Alignment*A)
5886 if ( !A || !A->nseq) return;
5888 fp=vfopen ( fname, "w");
5890 for ( a=0; a< A->nseq; a++)
5892 fprintf ( fp, ">%s %s\n%s\n", A->name[a], A->seq_comment[a], A->seq_al[a]);
/* main_output_fasta_seq: writes the sequences of A to fname in FASTA format.
 * Each sequence is ungapped in place (A is modified). The header line is ">name",
 * followed by " comment" only when header==HEADER and the comment is neither empty
 * nor blank. The residues are wrapped at 50 characters per line. */
5896 void main_output_fasta_seq (char *fname, Alignment*A,int header )
5901 fp=vfopen ( fname, "w");
5903 for ( a=0; a< A->nseq; a++)
5905 ungap(A->seq_al[a]);
5906 fprintf ( fp, ">%s", A->name[a]);
5907 if (header==HEADER && A->seq_comment[a][0] && !isblanc(A->seq_comment[a]))fprintf (fp," %s\n",A->seq_comment[a]);
5908 else fprintf ( fp, "\n");
5909 fp=output_string_wrap ( 50, A->seq_al[a],fp);
5910 fprintf ( fp, "\n");
/* output_gor_seq: writes the sequences of A to fname in GOR format.
 * Each sequence is ungapped and upper-cased in place (A is modified), then written
 * as a "!name length" line, the residues wrapped at 50 characters per line and an
 * "@" terminator line. */
5914 void output_gor_seq (char *fname, Alignment*A )
5919 fp=vfopen ( fname, "w");
5921 for ( a=0; a< A->nseq; a++)
5923 ungap(A->seq_al[a]);
5924 fprintf ( fp, "!%s %d \n", A->name[a], (int)strlen(A->seq_al[a]));
5925 upper_string ( A->seq_al[a]);
5926 fp=output_string_wrap ( 50, A->seq_al[a],fp);
5927 fprintf ( fp, "@\n");
5931 void output_pir_seq (char *fname, Alignment*A )
5934 for ( a=0; a< A->nseq; a++)ungap(A->seq_al[a]);
5935 output_pir_aln (fname, A);
/* Write every sequence of A to its own PIR-like file named
 * "<type>;<fname>_<sequence name>.seq".
 * type is "DL" when get_string_type() reports DNA or RNA and "P1" when it
 * reports PROTEIN. Each file holds ">name", an empty line, the sequence after
 * ungap() wrapped at 50, and a closing "*".
 * NOTE(review): no visible branch sets type for any other string type, so the
 * previous content of type would be reused in that case -- confirm. */
5937 void output_pir_seq1 (char *fname, Alignment*A )
5939 char seq_name[VERY_LONG_STRING];
5945 for ( a=0; a< A->nseq; a++)
5947 if ( strm ( get_string_type (A->seq_al[a]),"DNA") || strm ( get_string_type (A->seq_al[a]),"RNA"))sprintf(type, "DL");
5948 else if ( strm ( get_string_type (A->seq_al[a]),"PROTEIN"))sprintf(type, "P1");
5949 sprintf ( seq_name,"%s;%s_%s.seq",type, fname,A->name[a]);
5950 ungap ( A->seq_al[a]);
5951 fp=vfopen (seq_name, "w");
5952 fprintf (fp, ">%s\n\n", A->name[a]);
5953 fp=output_string_wrap ( 50, A->seq_al[a],fp);
5954 fprintf ( fp, "\n*\n");
5958 /*******************************************************************************************/
5963 /***************************************************************************************** */
/* Write alignment A to outfile in the MOCCA format: a header line block
 * (VERSION, DATE, score, number of sequences, length), one "name: score" line
 * per sequence, then the alignment body via output_Alignment_without_header.
 * Per-sequence scores come from S->score_seq; the sequences are reordered with
 * reorder_aln() according to the table sorted by sort_int_inv.
 * NOTE(review): score and new_name_order are sized with S->nseq but indexed
 * with a < A->nseq -- confirm A->nseq never exceeds S->nseq.
 * NOTE(review): the sort is applied to score+1 over 0..S->nseq-2, so the first
 * row is not moved -- presumably deliberate, confirm. */
5964 void output_mocca_aln (char *outfile, Alignment *A, Alignment *S)
5968 char **new_name_order;
5971 score=declare_int (S->nseq, 2);
5972 new_name_order=declare_char ( S->nseq,MAXNAMES+1);
5973 for ( a=0; a<A->nseq; a++)
5976 score[a][1]=S->score_seq[a];
5978 sort_int_inv (score+1,2,1,0,S->nseq-2);
5979 for ( a=0; a<A->nseq; a++)
5981 sprintf ( new_name_order[a], "%s", A->name[score[a][0]]);
5983 A=reorder_aln (A, new_name_order, A->nseq);
5985 fp=vfopen (outfile, "w");
5986 fprintf ( fp, "MOCCA,(%s,%s, C. Notredame)\nSCORE %d\nNSEQ %d\nLEN %d\n",VERSION,DATE, A->score_aln, A->nseq, A->len_aln);
5988 maxl=return_maxlen ( new_name_order, A->nseq);
5991 for (a=0; a< A->nseq; a++)
5993 fprintf (fp, "%-*s: %3d\n", maxl, A->name[a], score[a][1]);
5996 fprintf ( fp, "\n");
5998 fp=output_Alignment_without_header ( A, fp);
6000 free_int (score, -1);
6001 free_char (new_name_order, -1);
/* Build a copy X of alignment B that holds only the sequences listed in the
 * two groups ls[0][0..ns[0]-1] and ls[1][0..ns[1]-1]: sequences and names are
 * copied into X in that order, X->nseq counting them, and the name slot after
 * the last copied sequence is set to the empty string.
 * What is done with X afterwards is not visible in this excerpt. */
6005 void print_sub_aln ( Alignment *B, int *ns, int **ls)
6011 X=copy_aln (B, NULL);
6013 X->len_aln=strlen ( B->seq_al[ls[0][0]]);
6016 for (a=0; a< 2; a++)
6017 for ( b=0; b<ns[a]; b++, X->nseq++)
6019 sprintf ( X->seq_al[X->nseq], "%s", B->seq_al[ls[a][b]]);
6020 sprintf ( X->name[X->nseq], "%s", B->name[ls[a][b]]);
6022 X->name[X->nseq][0]='\0';
/* Debug helper: print alignment B on stderr, without header, through
 * output_Alignment_without_header. */
6027 void print_aln ( Alignment *B)
6032 output_Alignment_without_header ( B, stderr);
6038 FILE * output_aln ( Alignment *B, FILE *fp){return output_Alignment(B, fp);}
6039 FILE * output_Alignment ( Alignment *B, FILE *fp)
6041 fprintf ( fp, "%s, %s (%s) [%s] [MODE: %s]\n%s\nCPU %d sec\nSCORE %d\nNSEQ %d\nLEN %d\n",PROGRAM,VERSION,DATE,retrieve_mode(),URL,AUTHOR, (B->cpu+get_time())/1000, B->score_aln, B->nseq, B->len_aln);
6042 return output_Alignment_without_header ( B, fp);
/* Print alignment B on fp in interleaved blocks, without any program header.
 * Returns fp at once when fp is NULL.
 * Names are padded to MAX(longest name+2, 16) columns; the block width comes
 * from get_msa_line_length (0, 0). The row loop runs up to b==B->nseq
 * inclusive: for that extra row each column prints the symbol returned by
 * analyse_aln_column(). When B->output_res_num is set, a residue counter is
 * printed before and after each row; counters start at B->order[a][1] when
 * output_res_num==2 and at 0 otherwise. */
6045 FILE * output_Alignment_without_header ( Alignment *B, FILE *fp)
6054 if (fp==NULL)return fp;
6055 for ( a=0; a< B->nseq; a++)
6056 {if ( strlen (B->name[a])>max_len)
6057 max_len= strlen ( (B->name[a]));
6059 max_len=MAX(max_len+2, 16);
6060 line=get_msa_line_length (0, 0);
6061 n_residues=vcalloc ( B->nseq+1, sizeof (int));
6062 for ( a=0; a<B->nseq; a++)n_residues[a]=(B->output_res_num==2)?B->order[a][1]:0;
6067 fprintf ( fp, "\n");
6068 for (a=0; a<B->len_aln; a+=line)
6069 {for (b=0; b<=B->nseq; b++)
6071 fprintf (fp,"%-*s",max_len,B->name[b]);
6072 if (B->output_res_num)fprintf (fp, " %4d ", n_residues[b]+1);
6073 for (c=a;c<a+line && c<B->len_aln;c++)
/* extra row (b==B->nseq): per-column symbol instead of a residue */
6075 if (b==B->nseq){n_residues[b]++;s=analyse_aln_column ( B, c);}
6077 {n_residues[b]+=!is_gap(B->seq_al[b][c]);
6078 s=GET_CASE(B->residue_case, B->seq_al[b][c]);
6081 fprintf (fp,"%c",s );
6083 if (B->output_res_num)fprintf (fp, " %4d", n_residues[b]);
6090 fprintf (fp,"\n\n");
6095 FILE * output_aln_score ( Alignment *B, FILE *fp){return output_Alignment_score(B, fp);}
/* Print on fp a score view of alignment B: a header (CPU time, global score,
 * one "name: score" line per sequence from B->score_seq), then interleaved
 * blocks in which every position is rendered as one character derived from a
 * value ch:
 *   NO_COLOR_RESIDUE -> '-', NO_COLOR_GAP -> '*', 0..9 -> the digit,
 *   above 10 -> '#', negative -> '.', and the remaining case (10) -> '9'.
 * The name slot B->name[B->nseq] is overwritten with "CONS".
 * Returns fp at once when fp is NULL.
 * max_len is static, so the name column width can only grow across calls.
 * The lines that assign ch are not visible in this excerpt. */
6096 FILE * output_Alignment_score ( Alignment *B, FILE *fp)
6099 static int max_len=0;
6103 if (fp==NULL)return fp;
6106 for ( a=0; a< B->nseq; a++)
6107 {if ( strlen (B->name[a])>max_len)
6108 max_len= strlen ( (B->name[a]));
6113 line=get_msa_line_length(0, 0);
6114 sprintf (B->name[B->nseq], "CONS");
6115 fprintf ( fp, "T_COFFEE ALIGNMENT\nCPU TIME:%d sec.\n", (B->cpu+get_time())/1000);
6116 fprintf ( fp, "SCORE=%d\n", B->score_aln);
6117 for ( a=0;a<B->nseq; a++)fprintf ( fp, "%s: %d\n", B->name[a], B->score_seq[a]);
6118 fprintf ( fp, "\n");
6119 for (a=0; a<B->len_aln; a+=line)
6120 {for (b=0; b<B->nseq; b++)
6122 fprintf (fp,"%-*s",max_len,B->name[b]);
6123 for (c=a;c<a+line && c<B->len_aln;c++)
6126 if (ch==NO_COLOR_RESIDUE)fprintf (fp,"-");
6127 else if ( ch==NO_COLOR_GAP)fprintf (fp,"*");
6128 else if ( ch<10 && ch>=0)fprintf (fp,"%d",ch);
6129 else if ( ch>10)fprintf (fp,"#");
6130 else if ( ch<0)fprintf (fp,".");
6131 else fprintf (fp,"9");
/* after the sequence rows b==B->nseq, i.e. the row renamed "CONS" above */
6136 fprintf (fp,"%-*s",max_len,B->name[b]);
6137 for (c=a;c<a+line && c<B->len_aln;c++)
6140 if (ch==NO_COLOR_RESIDUE)fprintf (fp,"-");
6141 else if ( ch==NO_COLOR_GAP)fprintf ( fp, "*");
6142 else if ( ch<10 && ch>=0)fprintf (fp,"%d",ch);
6143 else if ( ch>10)fprintf (fp,"#");
6144 else if ( ch<0)fprintf (fp,".");
6145 else fprintf (fp,"9");
6147 fprintf (fp,"\n\n\n");
6149 fprintf (fp,"\n\n");
6152 FILE * output_aln_with_res_number ( Alignment *B, FILE *fp){return output_Alignment_with_res_number(B, fp);}
/* Print alignment B on fp in interleaved blocks with residue numbering.
 * Each row starts with the name, order[b][0] and the current value of
 * order[b][1], prints the upper-cased residues, and ends with order[b][1]
 * after it has been advanced by the number of non-gap positions in the row.
 * The counters live in a private copy of B->order, released at the end.
 * Returns fp at once when fp is NULL.
 * max_len is static, so the name column width can only grow across calls.
 * The line that sets the block width `line` is not visible in this excerpt. */
6153 FILE * output_Alignment_with_res_number ( Alignment *B, FILE *fp)
6156 static int max_len=0;
6160 if (fp==NULL)return fp;
6163 for ( a=0; a< B->nseq; a++)
6164 {if ( strlen (B->name[a])>max_len)
6165 max_len= strlen ( (B->name[a]));
6170 order=copy_int ( B->order,declare_int ( B->nseq, 2), B->nseq, 2);
6172 fprintf ( fp, "T_COFFEE ALIGNMENT\nCPU TIME:%d sec.\n", (B->cpu+get_time())/1000);
6173 fprintf ( fp, "\n");
6174 for (a=0; a<B->len_aln; a+=line)
6175 {for (b=0; b<B->nseq; b++)
6177 fprintf (fp,"%-*s %3d %4d ",max_len,B->name[b], order[b][0], order[b][1] );
6178 for (c=a;c<a+line && c<B->len_aln;c++)
6180 order[b][1]+=1-is_gap(B->seq_al[b][c]);
6181 fprintf (fp,"%c",toupper(B->seq_al[b][c]) );
6183 fprintf (fp," %4d\n", order[b][1] );
6187 fprintf (fp,"\n\n");
6189 free_int (order, -1);
/* Save the constraint list associated with alignment A to the file fname.
 * Visible cases, selected by mode:
 *   - A has no constraint list, or mode is "pdb": a fresh list is declared,
 *     filled from the alignment with aln2constraint_list, compacted, saved in
 *     "lib" format and freed;
 *   - mode starts with "extended_pair": the text after the 14th character is
 *     split on '_' into two names and the extended pair list is saved between
 *     a sub-list header and a footer;
 *   - mode is "extended_lib" or "extended_cosmetic": the extended list is
 *     saved, mode+9 being passed as the sub-mode;
 *   - otherwise: A->CL itself is compacted and saved in "lib" format.
 * At the end CL is freed only when it is not A->CL.
 * NOTE(review): strtok is used on the duplicated mode string; it keeps static
 * state and is not reentrant. */
6193 void output_constraints ( char *fname, char *mode,Alignment *A)
6196 Constraint_list *CL;
6200 if ( !A->CL || strm ( mode, "pdb"))
6207 CL=declare_constraint_list ( A->S, NULL, NULL, 0, NULL, NULL);
6208 CL=aln2constraint_list (A,CL, mode);
6209 compact_list (CL, 0, CL->ne, "default");
6210 fp=save_constraint_list ( CL, 0, CL->ne,fname, NULL, "lib",A->S);
6212 free_constraint_list (CL);
6215 else if ( strncmp ( mode, "extended_pair", 13)==0)
6217 buf=duplicate_string (mode+14);
6219 name_list=vcalloc(2, sizeof(char*));
6220 name_list[0]=strtok (buf,"_");
6221 name_list[1]=strtok (NULL,"_");
6226 compact_list (CL, 0, CL->ne, "default");
6227 fp=save_sub_list_header (vfopen(fname, "w"),2, name_list,CL);
6228 fp=save_extended_constraint_list_pair (CL, "pair",name_list[0],name_list[1],fp);
6229 fp=save_list_footer (fp, CL);
6232 else if ( strm2 (mode, "extended_lib","extended_cosmetic"))
6235 compact_list (CL, 0, CL->ne, "default");
6236 fp=save_extended_constraint_list ( CL,mode+9, vfopen(fname, "w"));
6240 CL=(Constraint_list *)A->CL;
6241 compact_list (CL, 0, CL->ne, "default");
6242 fp=save_constraint_list ( CL, 0, CL->ne,fname, NULL, "lib",A->S);
6246 if ( (Constraint_list *)A->CL !=CL)free_constraint_list (CL);
/* Write the model attached to alignment A to fname.
 * When A->Dp_result is NULL a warning is printed on stderr. Otherwise the file
 * receives one "#STATE <letter>: <comment>" line for every model state with a
 * non-empty comment, then a FASTA-like record titled with fname whose body is
 * the traceback encoded as letters ('a' + state index), wrapped at 50.
 * M and R are set on lines that are not visible in this excerpt. */
6251 void output_model_aln (char *fname, Alignment*A )
6259 if ( A->Dp_result==NULL)
6261 fprintf ( stderr, "\nWARNING Could Not Output Model %s [%s]", fname, PROGRAM);
6266 fp=vfopen ( fname, "w");
6267 for (a=0; a<M->nstate; a++)
6269 if (M->model_comments[a][0])fprintf ( fp, "#STATE %c: %s\n", 'a'+a, M->model_comments[a]);
6271 string=vcalloc ( R->len+1, sizeof (char));
6272 for (a=0; a<R->len; a++)string[a]=R->traceback[a]+'a';
6273 fprintf ( fp, ">%s\n",fname);
6274 fp=output_string_wrap ( 50,string, fp);
6276 fprintf ( fp, "\n");
/* Write ns sequences of alignment A to fname as FASTA records
 * (">name comment" then the aligned string on one line).
 * When fname is NULL a temporary name is obtained from vtmpnam (NULL).
 * The index s of each written sequence is set on a line that is not visible
 * here (presumably from ls[] -- confirm). */
6281 char * output_fasta_sub_aln (char *fname, Alignment*A, int ns, int *ls )
6285 if (fname==NULL)fname=vtmpnam (NULL);
6286 fp=vfopen (fname, "w");
6287 for (a=0; a<ns; a++)
6290 fprintf (fp, ">%s %s\n%s\n", A->name[s],A->seq_comment[s],A->seq_al[s]);
/* Two-group variant of output_fasta_sub_aln: writes ns[0] then ns[1]
 * sequences of A to fname as FASTA records.
 * When fname is NULL a temporary name is obtained from vtmpnam (NULL).
 * The index s of each written sequence is set on a line that is not visible
 * here (presumably from ls[g][a] -- confirm). */
6295 char * output_fasta_sub_aln2 (char *fname, Alignment*A, int *ns, int **ls )
6299 if (fname==NULL)fname=vtmpnam (NULL);
6300 fp=vfopen (fname, "w");
6301 for ( g=0; g<2; g++)
6302 for (a=0; a<ns[g]; a++)
6305 fprintf (fp, ">%s %s\n%s\n", A->name[s],A->seq_comment[s],A->seq_al[s]);
/* Write alignment A to out_file as rows of digits, one row per sequence.
 * A is first replaced by back_translate_dna_aln (A); some positions are set to
 * '-' (the selecting condition is not visible in this excerpt) and the
 * alignment is passed through ungap_aln_n (A, 1). Each remaining character is
 * lower-cased and mapped to a digit; the visible mappings are g->2, c->3,
 * t->4, u->5.
 * NOTE(review): the last visible statement is exit (EXIT_SUCCESS), so this
 * function appears to terminate the process instead of returning. */
6311 int output_suchard_aln (char *out_file, Alignment *A)
6316 A=back_translate_dna_aln (A);
6318 for ( c=0,a=0; a<A->len_aln; a++, c++)
6321 for (b=0; b<A->nseq; b++)
6325 A->seq_al[b][a]='-';
6329 A=ungap_aln_n (A, 1);
6330 fp=vfopen (out_file, "w");
6331 for ( a=0; a< A->nseq; a++)
6333 for (b=0; b< A->len_aln; b++)
6335 c=tolower(A->seq_al[a][b]);
6337 else if ( c=='g')d=2;
6338 else if ( c=='c')d=3;
6339 else if ( c=='t')d=4;
6340 else if ( c=='u')d=5;
6343 fprintf ( fp, "%d", d);
6345 fprintf ( fp, "\n");
6348 exit (EXIT_SUCCESS);
/* Write alignment A to fname in aligned FASTA format (gaps are kept).
 * The title line is ">name", followed by the comment when it is non-empty and
 * not blank according to isblanc(). Sequences are wrapped at the width
 * returned by get_msa_line_length (line, A->len_aln+1). */
6351 void output_fasta_aln (char *fname, Alignment*A )
6357 line=get_msa_line_length (line, A->len_aln+1);
6358 fp=vfopen ( fname, "w");
6360 for ( a=0; a< A->nseq; a++)
6362 fprintf ( fp, ">%s", A->name[a]);
6364 if ( A->seq_comment[a][0] && !isblanc (A->seq_comment[a]))fprintf ( fp, " %s", A->seq_comment[a]);
6365 fprintf ( fp, "\n");
6366 fp=output_string_wrap ( line,A->seq_al[a] , fp);
6367 fprintf ( fp, "\n");
/* Write alignment A to fname in PIR format: for each sequence a
 * ">type;name" line, the comment line, the sequence wrapped at 50 and a
 * closing "*". type is "DL" when get_string_type() reports DNA or RNA and
 * "P1" when it reports PROTEIN.
 * NOTE(review): no visible branch sets type for any other string type, so the
 * previous content of type would be reused in that case -- confirm. */
6372 void output_pir_aln (char *fname, Alignment*A )
6382 fp=vfopen ( fname, "w");
6383 for ( a=0; a< A->nseq; a++)
6385 if ( strm ( get_string_type (A->seq_al[a]),"DNA") || strm ( get_string_type (A->seq_al[a]),"RNA"))sprintf(type, "DL");
6386 else if ( strm ( get_string_type (A->seq_al[a]),"PROTEIN"))sprintf(type, "P1");
6387 fprintf ( fp, ">%s;%s\n%s\n",type, A->name[a], A->seq_comment[a]);
6388 fp=output_string_wrap ( 50,A->seq_al[a] , fp);
6389 fprintf ( fp, "\n*\n");
/* Update the file-level landscape_msa setting and return its new value.
 * A len of 0 stores -1, the value for which get_msa_line_length returns the
 * alignment length. The branch taken for a non-zero len is not visible in
 * this excerpt. */
6396 int set_landscape_msa (int len)
6398 if ( len==0)landscape_msa=-1;
6403 return landscape_msa;
6405 int get_msa_line_length (int line, int aln_len)
6407 if (landscape_msa==-1) return aln_len;
6408 else if ( landscape_msa)return landscape_msa;
6409 else if (line) return line;
6412 return (getenv ("ALN_LINE_LENGTH"))?atoi(getenv("ALN_LINE_LENGTH")):ALN_LINE_LENGTH;
/* Write alignment B to fname in MSF (PileUp) format.
 * Side effect: the residues of B->seq_al are upper-cased in place.
 * For each sequence a checksum is computed with SeqGCGCheckSum on a copy in
 * which gaps are replaced by '.'; the sum of the checksums modulo 10000 goes
 * in the header. The header lists name, length, checksum and weight
 * ((B->S)->W->SEQ_W[i] when weights exist, 1.00 otherwise) for every
 * sequence. The body is interleaved, gaps printed as '.', with extra handling
 * every `block` columns (those lines are not visible in this excerpt).
 * NOTE(review): grand_checksum is accumulated with += ; its initialisation is
 * not visible here -- confirm it starts at 0. */
6416 void output_msf_aln (char *fname,Alignment *B)
6422 long grand_checksum;
6431 line=get_msa_line_length (line, B->len_aln+1);
6434 for ( max_len=0,a=0; a< B->nseq; a++)max_len= MAX(strlen ( B->name[a]),max_len);
6439 fp=vfopen (fname, "w");
/* seq holds exactly len_aln characters without terminator; it is only passed together with its length */
6441 seq =vcalloc(B->len_aln, sizeof(char));
6442 all_checks =vcalloc(B->nseq, sizeof(int));
6443 for ( i=0; i< B->nseq; i++)
6445 for ( j=0; j<B->len_aln; j++)
6447 if ( is_gap(B->seq_al[i][j]))seq[j]='.';
6448 else seq[j]=B->seq_al[i][j]=toupper(B->seq_al[i][j]);
6451 all_checks[i] = SeqGCGCheckSum(seq, (int)B->len_aln);
6454 for(i=0; i<B->nseq; i++) grand_checksum += all_checks[i];
6455 grand_checksum = grand_checksum % 10000;
6456 fprintf(fp,"PileUp\n\n");
6458 fprintf(fp,"\n\n MSF:%5d Type: ",B->len_aln);
6459 if(strm ( (B->S)->type, "DNA") || strm ( (B->S)->type, "RNA"))
6463 fprintf(fp," Check:%6ld .. \n\n", (long)grand_checksum);
6464 for (i=0; i< B->nseq; i++)
6466 fprintf ( fp, " Name: %s oo Len:%5d Check:%6ld Weight: %.3f\n", B->name[i], B->len_aln,(long)all_checks[i],(B->S)->W?((B->S)->W)->SEQ_W[i]:1.00);
6468 fprintf(fp,"\n//\n\n");
6470 for (a=0; a<B->len_aln; a+=line)
6472 fprintf ( fp,"\n\n");
6473 for (b=0; b<B->nseq; b++)
6475 fprintf (fp,"%-*s ",max_len,B->name[b]);
6476 for (c_block=0,c=a;c<a+line && c<B->len_aln;c++)
6478 if ( c_block==block)
6484 aa=(is_gap(B->seq_al[b][c]))?'.': toupper(B->seq_al[b][c]);
6485 fprintf (fp,"%c",aa );
6487 if ( c_block==block)
/* Compute the checksum written in the MSF header for one sequence.
 *
 * Each of the first len characters of seq is upper-cased and multiplied by a
 * weight that cycles through 1..57 with its position; the sum is reduced
 * modulo 10000.
 *
 * seq : characters to checksum; exactly len characters are read, a NUL
 *       terminator is neither required nor looked for.
 * len : number of characters to read.
 * Returns the checksum (0..9999); 0 when seq is NULL or len <= 0.
 */
int SeqGCGCheckSum(char *seq, int len)
{
  long check = 0;
  int i;

  if (seq == NULL) return 0;

  for (i = 0; i < len; i++)
    {
      /* toupper() is only defined for values representable as unsigned char
         (or EOF); a plain char may be negative, hence the cast */
      check += ((i % 57) + 1) * toupper((unsigned char)seq[i]);
    }

  return (int)(check % 10000);
}
/* Older MSF writer: header "MSF: <len> Type P Check: 5083 ..", one "Name:"
 * line per sequence, "//", then interleaved blocks in which '-' is printed as
 * '.' and residues are upper-cased. put_seq[] flags sequences for which the
 * working copy in buf is non-empty (the processing applied to buf is not
 * visible in this excerpt).
 * NOTE(review): seq_max_len is computed as MAX(strlen(seq), max_len), i.e.
 * against the name width rather than the running seq_max_len, so it ends up
 * reflecting only the last sequence -- buf may be too small for a longer
 * earlier sequence; confirm.
 * NOTE(review): buf is allocated with sizeof (int) per element but used as a
 * character buffer.
 * NOTE(review): the put_seq loop starts at b=1, leaving put_seq[0] at the 0
 * written by vcalloc unless a hidden line sets it. */
6516 void old_output_msf_aln (char *fname,Alignment *B)
6519 static int *put_seq;
6527 line=get_msa_line_length (line, B->len_aln+1);
6530 for ( max_len=0,a=0; a< B->nseq; a++)max_len= MAX(strlen ( B->name[a]),max_len);
6531 for ( seq_max_len=0,a=0; a< B->nseq; a++)seq_max_len= MAX(strlen ( B->seq_al[a]),max_len);
6534 buf=vcalloc(seq_max_len+1, sizeof (int));
6537 put_seq= vcalloc ( B->nseq, sizeof (int));
6541 for ( b=1; b< B->nseq; b++)
6543 sprintf ( buf, "%s", B->seq_al[b]);
6545 put_seq[b]=( strlen (buf)>0)?1:0;
6548 fp=vfopen ( fname, "w");
6549 fprintf ( fp, "MSF: %d Type P Check: 5083 ..\n", B->len_aln);
6550 for ( a=0; a< B->nseq; a++)
6553 fprintf ( fp,"Name: %s\n",B->name[a]);
6555 fprintf ( fp, "//\n");
6556 for (a=0; a<B->len_aln; a+=line)
6557 {for (b=0; b<B->nseq; b++)
6561 fprintf (fp,"%-*s ",max_len,B->name[b]);
6562 for (c=a;c<a+line && c<B->len_aln;c++)
6567 aa=(B->seq_al[b][c]=='-')?'.': toupper(B->seq_al[b][c]);
6568 fprintf (fp,"%c",aa );
6575 fprintf ( fp,"\n\n");
/* Write alignment B to the file `name` in the SAGA format: a header giving
 * the file name, number of sequences and length, then interleaved blocks of
 * "name residues" rows with the characters of B->seq_al printed unchanged.
 * The block width comes from get_msa_line_length (line, B->len_aln+1). */
6582 void output_saga_aln ( char *name, Alignment *B)
6592 line=get_msa_line_length (line, B->len_aln+1);
6596 for ( max_len=0,a=0; a< B->nseq; a++)max_len= (strlen ( B->name[a])>max_len)?(strlen ( B->name[a])):max_len;
6601 fp= vfopen ( name, "w");
6603 fprintf (fp, "\nSAGA FORMAT\nalignement %s nseq=%d len=%d\n", name, B->nseq, B->len_aln);
6605 fprintf (fp, "\n\n");
6606 for (a=0; a<B->len_aln; a+=line)
6607 {for (b=0; b<B->nseq; b++)
6608 {fprintf (fp,"%-*s ",max_len,B->name[b]);
6609 for (c=a;c<a+line && c<B->len_aln;c++)
6611 fprintf (fp,"%c",(B->seq_al[b][c]) );
6617 fprintf (fp,"\n\n");
/* Write alignment B to the file `name` in a compact interleaved layout.
 * For every block, do_print counts the non-gap positions of each row (the test
 * that uses do_print is not visible here; presumably rows without residues are
 * skipped -- confirm). Gap characters other than '-' are printed as '-'.
 * The header text is the same "SAGA FORMAT" banner used by output_saga_aln. */
6620 void output_compact_aln ( char *name, Alignment *B)
6630 line=get_msa_line_length (line, B->len_aln+1);
6633 for ( max_len=0,a=0; a< B->nseq; a++)max_len= (strlen ( B->name[a])>max_len)?(strlen ( B->name[a])):max_len;
6638 fp= vfopen ( name, "w");
6640 fprintf (fp, "\nSAGA FORMAT\nalignement %s nseq=%d len=%d", name, B->nseq, B->len_aln);
6641 fprintf (fp, "\n\n");
6642 for (a=0; a<B->len_aln; a+=line)
6643 {for (b=0; b<B->nseq; b++)
6646 for ( do_print=0, c=a;c<a+line && c<B->len_aln;c++)
6647 do_print+=1-is_gap(B->seq_al[b][c]);
6650 fprintf (fp,"%-*s ",max_len,B->name[b]);
6654 for (c=a;c<a+line && c<B->len_aln;c++)
6656 if ( is_gap(B->seq_al[b][c])&& B->seq_al[b][c]!='-' )fprintf (fp,"%c", '-');
6657 else fprintf (fp,"%c",(B->seq_al[b][c]) );
6664 fprintf (fp,"\n\n");
6668 void output_clustal_aln ( char *name, Alignment *B)
6670 return output_generic_clustal_aln (name, B, "tc_clustal");
6672 void output_strict_clustal_aln ( char *name, Alignment *B)
6674 return output_generic_clustal_aln (name, B, "strict_clustal");
/* Write alignment B to the file `name` in CLUSTAL format.
 * mode "strict_clustal" prints the "CLUSTAL W (1.83) multiple sequence
 * alignment" banner; the other visible branch prints a banner with program,
 * version, mode, CPU time, score, number of sequences and length.
 * Block width: when the SEP_4_TCOFFEE environment variable is set, `line` is
 * advanced to just after the first 'o'/'O' of the first sequence; in the other
 * visible branch it is advanced to B->len_aln. If it ends up equal to
 * B->len_aln it is replaced by get_msa_line_length (0, B->len_aln+1).
 * Rows b<B->nseq print the residues (gaps as '-', case through GET_CASE) and,
 * when B->output_res_num is set, the residue counter; the extra row
 * b==B->nseq prints the analyse_aln_column() symbol for each column.
 * Names are padded to MAX(longest name+2, 16). */
6677 void output_generic_clustal_aln ( char *name, Alignment *B, char *mode)
6685 if ( getenv ("SEP_4_TCOFFEE"))
6687 while ( line<B->len_aln && B->seq_al[0][line]!='o' && B->seq_al[0][line]!='O')line++;
6688 if ( B->seq_al[0][line]=='O' || B->seq_al[0][line]=='o')line++;
6692 while ( line<B->len_aln)line++;
6695 if ( line==B->len_aln)line=get_msa_line_length (0, B->len_aln+1);
6697 n_residues=vcalloc ( B->nseq+1, sizeof (int));
6698 for ( a=0; a< B->nseq; a++)
6699 {if ( strlen (B->name[a])>max_len)
6700 max_len= strlen ( (B->name[a]));
6701 n_residues[a]=B->order[a][1];
6703 max_len=MAX(max_len+2, 16);
6706 fp= vfopen ( name, "w");
6708 if ( strm (mode, "strict_clustal"))
6709 fprintf ( fp, "CLUSTAL W (1.83) multiple sequence alignment");
6711 fprintf (fp, "CLUSTAL FORMAT for %s %s [%s] [MODE: %s ], CPU=%.2f sec, SCORE=%d, Nseq=%d, Len=%d ", PROGRAM, VERSION,URL, retrieve_mode (),(float)(B->cpu+get_time())/1000, B->score_aln, B->nseq, B->len_aln);
6712 fprintf (fp, "\n\n");
/* the condition guarding this name-only listing is not visible in this excerpt */
6717 for (b=0; b<=B->nseq; b++)
6718 fprintf (fp,"%-*s -\n",max_len, B->name[b]);
6723 for (a=0; a<B->len_aln; a+=line)
6724 {for (b=0; b<=B->nseq; b++)
6728 fprintf (fp,"%-*s",max_len, B->name[b]);
6729 for (c=a;c<a+line && c<B->len_aln;c++)
6731 if ( is_gap(B->seq_al[b][c]))fprintf (fp,"%c", '-');
6735 fprintf (fp, "%c", GET_CASE(B->residue_case, B->seq_al[b][c]));
6740 if (B->output_res_num)fprintf (fp, " %d", n_residues[b]);
6743 else if ( b==B->nseq)
6745 fprintf (fp,"%-*s",max_len," ");
6746 for (c=a;c<a+line && c<B->len_aln;c++)
6748 fprintf ( fp, "%c", analyse_aln_column (B, c));
6756 fprintf (fp,"\n\n");
/* Print alignment B on the already opened stream fp in interleaved blocks of
 * `line` columns. Gaps are printed with the character `gap`, residues through
 * GET_CASE(B->residue_case, ...). Names are padded to
 * MAX(longest name+2, 16); when B->output_res_num is set the residue counter
 * (started at B->order[a][1]) is appended to each row.
 * No use of `mode` is visible in this excerpt. */
6760 FILE * output_generic_interleaved_aln (FILE *fp, Alignment *B, int line, char gap, char *mode)
6767 n_residues=vcalloc ( B->nseq+1, sizeof (int));
6768 for ( a=0; a< B->nseq; a++)
6769 {if ( strlen (B->name[a])>max_len)
6770 max_len= strlen ( (B->name[a]));
6771 n_residues[a]=B->order[a][1];
6773 max_len=MAX(max_len+2, 16);
/* the condition guarding this name-only listing is not visible in this excerpt */
6780 for (b=0; b<=B->nseq; b++)
6781 fprintf (fp,"%-*s -\n",max_len, B->name[b]);
6786 for (a=0; a<B->len_aln; a+=line)
6787 {for (b=0; b<=B->nseq; b++)
6791 fprintf (fp,"%-*s",max_len, B->name[b]);
6792 for (c=a;c<a+line && c<B->len_aln;c++)
6794 if ( is_gap(B->seq_al[b][c]))fprintf (fp,"%c", gap);
6798 fprintf (fp, "%c", GET_CASE(B->residue_case, B->seq_al[b][c]));
6803 if (B->output_res_num)fprintf (fp, " %d", n_residues[b]);
/* Write alignment B to the file `name` in interleaved PHYLIP format.
 * First line: number of sequences and alignment length. Each row starts with
 * the name truncated/padded to 10 characters while print_name[b] is 0, and
 * with 10 blanks in the other visible branch (print_name[] is presumably set
 * after the first block -- the line doing so is not visible). Gap characters
 * other than '-' are printed as '-'.
 * Block width comes from get_msa_line_length (0, 0). */
6813 void output_phylip_aln ( char *name, Alignment *B)
6820 line=get_msa_line_length(0, 0);
6822 print_name=vcalloc ( B->nseq, sizeof (int));
6823 fp= vfopen ( name, "w");
6825 fprintf (fp, "%3d %d\n", B->nseq, B->len_aln);
6826 for (a=0; a<B->len_aln; a+=line)
6827 {for (b=0; b<B->nseq; b++)
6828 {if ( print_name[b]==0)
6831 fprintf (fp,"%-10.10s ",B->name[b]);
6836 fprintf (fp, "%10.10s ", " ");
6840 for (d=0,c=a;c<a+line && c<B->len_aln;c++, d++)
6847 if ( is_gap(B->seq_al[b][c])&& B->seq_al[b][c]!='-' )fprintf (fp,"%c", '-');
6848 else fprintf (fp,"%c",(B->seq_al[b][c]) );
6854 fprintf (fp,"\n\n");
/* Write two files derived from out_file:
 *   "<out_file>.mss"     : an "ST" record holding the structure string threaded
 *                          on the first aligned sequence, then one "AS name"
 *                          record for every sequence but the last;
 *   "<out_file>.one_rna" : the last sequence as a FASTA record.
 * The threaded string takes the next character of STRUC->seq[0] at every
 * position where A->seq_al[0] is not a gap (the statement used for gap
 * positions is not visible in this excerpt).
 * NOTE(review): strlen ( A->seq_al[0]+1) measures the string starting at its
 * second character, so buf is allocated with strlen-1 bytes while the loop
 * writes strlen bytes; the intended size is most likely
 * strlen (A->seq_al[0])+1.
 * NOTE(review): bank_file is a 100-byte buffer filled with sprintf from
 * out_file; a long out_file overflows it. */
6858 void output_rnalign (char *out_file, Alignment *A, Sequence *STRUC)
6862 char bank_file[100];
6866 sprintf ( bank_file, "%s.mss", out_file);
6867 sprintf ( pep_file, "%s.one_rna", out_file);
6870 buf=vcalloc ( strlen ( A->seq_al[0]+1), sizeof (char));
6872 for ( b=0,a=0; a< strlen(A->seq_al[0]); a++)
6874 if ( is_gap(A->seq_al[0][a]))
6877 buf[a]=STRUC->seq[0][b++];
6881 fp=vfopen ( bank_file, "w");
6883 fprintf ( fp, "ST\n");
6884 fp=output_string_wrap ( 50, buf, fp);
6885 fprintf ( fp, "\n\n");
6887 for ( a=0; a<A->nseq-1; a++)
6889 fprintf ( fp, "AS %s\n ", A->name[a]);
6890 fp=output_string_wrap ( 50, A->seq_al[a], fp);
6891 fprintf ( fp, "\n\n");
6894 fp=vfopen ( pep_file, "w");
6895 fprintf ( fp, ">%s\n", A->name[A->nseq-1]);
6896 fp=output_string_wrap ( 50, A->seq_al[A->nseq-1], fp);
6897 fprintf ( fp, "\n");
/* For every pair (a, b) of sequences of A with a<b, copy the two aligned
 * sequences and their names into a scratch alignment B and write it with
 * output_clustal_aln to "<prefix>_<name a>_<name b>.lib", prefix being
 * pw_lib_saga_aln_name.
 * NOTE(review): fname is a fixed VERY_LONG_STRING buffer filled with sprintf. */
6901 void output_lib (char *pw_lib_saga_aln_name, Alignment *A )
6904 char fname[VERY_LONG_STRING];
6907 B=declare_Alignment (NULL);
6911 for ( a=0; a< A->nseq-1; a++)
6913 for ( b=a+1; b<A->nseq; b++)
6915 sprintf ( B->seq_al[0], "%s", A->seq_al[a]);
6916 sprintf ( B->name[0], "%s", A->name[a]);
6917 sprintf(B->name[1], "%s", A->name[b]);
6918 sprintf ( B->seq_al[1], "%s",A->seq_al[b]);
6920 sprintf ( fname, "%s_%s_%s.lib",pw_lib_saga_aln_name, A->name[a], A->name[b]);
6922 B->len_aln=strlen ( B->seq_al[0]);
6924 output_clustal_aln (fname,B);
/* Same pairwise export as output_lib, with the file suffix
 * ".pw_lib_saga_aln": every pair (a, b) with a<b is copied into a scratch
 * alignment B and written with output_clustal_aln to
 * "<prefix>_<name a>_<name b>.pw_lib_saga_aln".
 * NOTE(review): fname is a fixed VERY_LONG_STRING buffer filled with sprintf. */
6928 void output_pw_lib_saga_aln (char *pw_lib_saga_aln_name, Alignment *A )
6931 char fname[VERY_LONG_STRING];
6934 B=declare_Alignment (NULL);
6938 for ( a=0; a< A->nseq-1; a++)
6940 for ( b=a+1; b<A->nseq; b++)
6942 sprintf ( B->seq_al[0], "%s", A->seq_al[a]);
6943 sprintf ( B->name[0], "%s", A->name[a]);
6944 sprintf(B->name[1], "%s", A->name[b]);
6945 sprintf ( B->seq_al[1], "%s",A->seq_al[b]);
6947 sprintf ( fname, "%s_%s_%s.pw_lib_saga_aln",pw_lib_saga_aln_name, A->name[a], A->name[b]);
6949 B->len_aln=strlen ( B->seq_al[0]);
6951 output_clustal_aln (fname,B);
/* Create (mode "w") the file `name` and write the Lalign banner: program
 * version, then for the two sequences referenced by A->order[0][0] and
 * A->order[1][0] their source file, name and length taken from A->S. */
6955 void output_lalign_header( char *name, Alignment *A)
6959 fp=vfopen ( name, "w");
6960 fprintf ( fp, " Lalign mode: best local alignments between two sequences\n");
6961 fprintf ( fp, " %s(%s) [%s]\n\n", VERSION, DATE, URL);
6962 fprintf ( fp, " Comparison of:\n(A) %s\t%s\t-%d aa\n", (A->S)->file[A->order[0][0]],(A->S)->name[A->order[0][0]], (A->S)->len[A->order[0][0]]);
6963 fprintf ( fp, "(B) %s\t%s\t-%d aa\n", (A->S)->file[A->order[1][0]],(A->S)->name[A->order[1][0]], (A->S)->len[A->order[1][0]]);
/* Write alignment A to `file` in Stockholm 1.0 format.
 * Side effect: every STOCKHOLM_CHAR found in A->seq_al is replaced by '.'
 * in place. The body is printed by output_generic_interleaved_aln with 50
 * columns per block and '.' as gap character, followed by "//".
 * No use of ST is visible in this excerpt. */
6969 void output_stockholm_aln (char *file, Alignment *A, Alignment *ST)
6974 for (a=0; a<A->nseq; a++)
6975 for (b=0; b<A->len_aln; b++)
6976 if (A->seq_al[a][b]==STOCKHOLM_CHAR)A->seq_al[a][b]='.';
6978 fp=vfopen (file, "w");
6979 fprintf ( fp, "# STOCKHOLM 1.0\n\n");
6980 output_generic_interleaved_aln (fp,A, 50, '.', NULL);
6981 fprintf ( fp, "//\n");
/* Write alignment B to the file `name` in the GLALIGN_01 format: a commented
 * banner, the list of genomes (sequence names), then one line per alignment
 * column giving, for each sequence, "----" on a gap or the residue counter
 * nr[b][1], followed by the column reliability in brackets.
 * The reliability is read from S->seq_al[S->nseq][a] when S is not NULL and is
 * -1 otherwise; NO_COLOR_RESIDUE is also reported as -1.
 * Returns immediately when B is NULL.
 * NOTE(review): S->score_aln is dereferenced in the "Alignment ..." line
 * although the reliability line explicitly allows S to be NULL. */
6985 void output_glalign ( char *name, Alignment *B, Alignment *S)
6992 if ( B==NULL){return;}
6994 fp=vfopen (name, "w");
6995 fprintf (fp, "Format: GLALIGN_01 [Generated with %s ]\n", PROGRAM);
6996 fprintf (fp, "#Each Line corresponds to a column\n");
6997 fprintf (fp, "#First column coresponds to first genome\n");
6998 fprintf (fp, "#Last Column gives the column reliability on a 0-9 scale\n");
6999 fprintf (fp, "#[-1] Indicates that the reliability was not evaluated\n");
7001 fprintf (fp, "Genome List\n");
7002 for ( a=0; a< B->nseq; a++)
7003 fprintf (fp, "\tGenome %s\n", B->name[a]);
7004 fprintf (fp, "Alignment List\n");
7007 fprintf (fp, "Alignment %d Len %d Score %d\n", ++naln, B->len_aln, S->score_aln);
7008 nr=duplicate_int (B->order, -1, -1);
7009 for ( a=0; a< B->len_aln; a++)
7011 fprintf ( fp, "\t");
7012 for ( b=0; b< B->nseq; b++)
7014 g=is_gap (B->seq_al[b][a]);
7017 if (g)fprintf (fp, "---- ");
7018 else fprintf ( fp, "%4d ",nr[b][1]);
7020 s=((S)?S->seq_al[S->nseq][a]:-1);
7021 if (s==NO_COLOR_RESIDUE)s=-1;
7022 fprintf ( fp,"[ %d ]",s);
7023 fprintf ( fp, "\n");
/* Read a file made of several alignments separated by TC_REC_SEPARATOR.
 * The whole file is loaded as one string, literal '@' characters are replaced
 * by a placeholder, the separator is turned into '@', and the string is split
 * with strtok. Every record after the first token is written to a temporary
 * file and read back with main_read_aln. When IN is provided, the alignment
 * read is copied into it.
 * How the alignments are chained and what is returned is not visible in this
 * excerpt.
 * NOTE(review): '@' is protected as "!Protected!" but restored from
 * "!protected!" (different case); unless substitute() ignores case, protected
 * characters are never restored -- confirm. */
7032 Alignment *input_conc_aln ( char *name, Alignment *IN)
7035 char *string, *p, *file;
7036 Alignment *F=NULL,*A=NULL, *B=NULL;
7038 file=vtmpnam (NULL);
7040 string=file2string(name);
7041 string=substitute ( string, "@", "!Protected!");
7042 string=substitute ( string, TC_REC_SEPARATOR, "@");
7043 strtok (string,"@");
7046 while ( (p=strtok (NULL,"@"))!=NULL)
7049 buf=vcalloc ( strlen (p)+1, sizeof (char));
7050 sprintf (buf,"%s", p);
7051 buf=substitute (buf,"!protected!", "@");
7053 fp=vfopen (file, "w");
7054 fprintf ( fp, "%s",buf);
7060 B=main_read_aln (file,NULL);
7064 if (IN){copy_aln (B, IN);F=A=IN;}
/* Write alignment B to the file `name` in the concatenated MSF format: a
 * "# CONC_MSF_FORMAT_01" line, then a TC_REC_SEPARATOR line followed by the
 * sequences as ">name" / aligned string records. Any loop over several
 * alignments is not visible in this excerpt. */
7079 void output_conc_aln ( char *name, Alignment *B)
7084 fp=vfopen (name, "w");
7085 fprintf (fp, "# CONC_MSF_FORMAT_01\n");
7088 fprintf (fp, "%s\n", TC_REC_SEPARATOR);
7089 for ( a=0; a< B->nseq; a++)
7091 fprintf ( fp, ">%s\n%s\n", B->name[a], B->seq_al[a]);
/* Append alignment B to the file `name` in Lalign format.
 * A static flag tracks whether the header has been written: it is reset when
 * the function is called with B==NULL, and output_lalign_header is called
 * while the flag is 0 (the line setting the flag is not visible here). */
7099 void output_lalign ( char *name, Alignment *B)
7101 static int output_header;
7104 if ( B==NULL){output_header=0;return;}
7105 else if ( output_header==0)
7107 output_lalign_header(name, B);
7112 output_lalign_aln ( name, B);
/* Append (mode "a") one pairwise alignment B to the file `name` in Lalign
 * layout. A summary line gives the identity, the overlap and B->score_aln;
 * id counts identical characters over columns where neither of the first two
 * sequences has a gap (the normalisation of id and tot is not visible here).
 * Each block is made of 5 rows: ruler rows printing the residue counter every
 * 10 residues, the two sequence rows (b==1 and b==3, gaps as '-'), and a
 * match row in which the analyse_aln_column() symbols are shifted:
 * '*' -> ':', ':' -> '.', '.' -> ' '.
 * The block ends with a "----------" separator. */
7116 void output_lalign_aln ( char *name, Alignment *B)
7118 int a, b, c,d=0, s=0;
7131 n_residues=vcalloc ( B->nseq+1, sizeof (int));
7132 for ( a=0; a< B->nseq; a++)
7133 {if ( strlen (B->name[a])>max_len)
7134 max_len= strlen ( (B->name[a]));
7135 n_residues[a]=B->order[a][1];
7137 max_len=MAX(max_len+2, 16);
7142 fp= vfopen ( name, "a");
7144 for (a=0; a< B->len_aln; a++)
7146 if ( !is_gap(B->seq_al[0][a]) && !is_gap(B->seq_al[1][a]))
7149 id+=(B->seq_al[0][a]==B->seq_al[1][a]);
7154 fprintf (fp, " %.1f%% identity in %d aa overlap; score: %d\n\n", id,(int)tot, B->score_aln);
7157 for (a=0; a<B->len_aln; a+=line)
7158 {for (b=0; b<5; b++)
7164 fprintf (fp,"%-*s",max_len," ");
7165 for (d=0,c=a;c<a+line && c<B->len_aln;c++)
7167 res=!is_gap ( B->seq_al[s][c]);
/* print the counter only when the 4-character label still fits in the block */
7169 if ( (n_residues[s]%10)==0 && res && (c-a+4)<line){fprintf (fp, "%-4d", n_residues[s]);d=-3;}
7172 if ( d==0)fprintf (fp, " ");
7178 else if (b==1 || b==3)
7182 fprintf (fp,"%-*s",max_len, B->name[s]);
7183 for (c=a;c<a+line && c<B->len_aln;c++)
7185 if ( is_gap(B->seq_al[s][c]))fprintf (fp,"%c", '-');
7188 fprintf (fp, "%c", GET_CASE(B->residue_case, B->seq_al[s][c]));
7195 fprintf (fp,"%-*s",max_len," ");
7196 for (c=a;c<a+line && c<B->len_aln;c++)
7198 col=analyse_aln_column (B, c);
7199 if ( col=='*')col=':';
7200 else if ( col==':')col='.';
7201 else if ( col=='.')col=' ';
7202 fprintf ( fp, "%c", col);
7210 fprintf (fp,"\n\n----------\n\n");
7216 /****************************************************************************************************/
7217 /*************************************UTIL *********************************************************/
7218 /**************************************************************************************************/
7221 /****************************************************************************************************/
7222 /*************************** *************************************/
7223 /*************************** PROCESSING *************************************/
7224 /*************************** *************************************/
7225 /*******************************************************************************************/
7229 /***************************************************************************************** */
/* Build a new string three times as long as the input (l*3+1 bytes are
 * allocated), walking the input one character at a time and the output three
 * characters at a time. What is written at each step is not visible in this
 * excerpt. */
7231 char *thread_aa_seq_on_dna_seq( char *s)
7238 array=vcalloc ( l*3 +1, sizeof (char));
7239 for ( b=0, c=0; b< l; b++, c+=3)
/* Thread the DNA sequences of S on the protein alignment A.
 * The result B is resized to three columns per column of A. For each sequence
 * of A, the sequence of S with the same name is looked up; when found, every
 * column of A yields three characters: "---" for a gap, otherwise the next
 * three characters of the DNA sequence. In the other visible branch the
 * residue of A is copied followed by "--".
 * B->len_aln is set from the length of the first threaded sequence.
 * How B is created and returned is not visible in this excerpt. */
7249 Alignment *thread_dnaseq_on_prot_aln (Sequence *S, Alignment *A)
7252 int a, b, c, n, la, ls, ln, m;
7255 B=realloc_aln2 ( B, B->nseq, B->len_aln*3 +1);
7257 for ( n=0,a=0; a< A->nseq; a++)
7259 for ( m=0,b=0; b< S->nseq; b++)
7261 if (strm (A->name[a], S->name[b]) )
7266 B->seq_al[a][0]='\0';
7267 for (la=0, ls=0, ln=0; la< A->len_aln; la++)
7269 for (c=0; c< 3; c++)
7270 B->seq_al[a][ls++]=(is_gap(A->seq_al[a][la]))?'-':S->seq[b][ln++];
7272 B->seq_al[a][ls]='\0';
7277 for (la=0, ls=0, ln=0; la< A->len_aln; la++)
7280 B->seq_al[a][ls++]=A->seq_al[a][la];
7281 B->seq_al[a][ls++]='-';
7282 B->seq_al[a][ls++]='-';
7287 B->len_aln=strlen ( B->seq_al[0]);
/* Replace, in place, the residues of alignment A by the characters of the
 * sequences of ST carrying the same name: every non-gap position of
 * A->seq_al[a] receives the next character of ST->seq[b]; gaps are kept.
 * If ST holds a sequence named "Cons" whose length equals A->len_aln, it is
 * copied into the slot after the last sequence (A->name[A->nseq] and
 * A->seq_al[A->nseq]).
 * NOTE(review): nothing visible checks that ST->seq[b] has at least as many
 * characters as A->seq_al[a] has residues. */
7290 void thread_seq_struc2aln ( Alignment *A, Sequence *ST)
7295 for ( a=0; a< A->nseq; a++)
7296 for ( b=0; b< ST->nseq; b++)
7298 if ( strcmp ( A->name[a], ST->name[b])==0)
7301 len=strlen(A->seq_al[a]);
7302 for ( c=0, d=0; c<len; c++)
7304 if ( !is_gap(A->seq_al[a][c]))A->seq_al[a][c]=ST->seq[b][d++];
7308 cons=name_is_in_list ("Cons", ST->name, ST->nseq, 100);
7309 if ( cons!=-1 && A->len_aln==strlen ( ST->seq[cons]))
7311 sprintf (A->name[A->nseq], "Cons");
7312 sprintf (A->seq_al[A->nseq],"%s", ST->seq[cons]);
/* Recode alignment A in place.
 * First pass, per column: non-gap positions become 'h' when no sequence has a
 * gap in the column, and 'x' otherwise.
 * Second pass, per sequence, over the inner positions 1..len_aln-2: a position
 * is set to 'h' when both neighbours are 'h' or 'b', and to 'b' otherwise (the
 * condition selecting which positions are re-examined is not visible here);
 * finally every 'b' of the inner positions is turned into 'x'. */
7316 void cache_id ( Alignment *A)
7321 for ( a=0; a< A->len_aln; a++)
7323 for ( b=0, n=0; b< A->nseq; b++)if ( !is_gap(A->seq_al[b][a]))n++;
7324 for ( b=0; b< A->nseq; b++)
7325 if ( !is_gap(A->seq_al[b][a]) && n==A->nseq)A->seq_al[b][a]='h';
7326 else if( !is_gap(A->seq_al[b][a]))A->seq_al[b][a]='x';
7328 for ( a=0; a< A->nseq; a++)
7330 for ( b=1; b< A->len_aln-1; b++)
7332 r1=A->seq_al[a][b-1];
7334 r3=A->seq_al[a][b+1];
7337 if ( (r1=='h' || r1=='b') && (r3=='h' || r3=='b'))A->seq_al[a][b]='h';
7338 else A->seq_al[a][b]='b';
7341 for ( b=1; b< A->len_aln-1; b++)if ( A->seq_al[a][b]=='b')A->seq_al[a][b]='x';
7347 /*******************************************************************************************/
7350 /*                              PROCESSING OF EST                                          */
7352 /***************************************************************************************** */
/* Orient and cluster the sequences of S.
 * For every pair (a, b) the best match of a against b (sens) and against the
 * inverted copy of b produced by invert_seq (a_sens) is computed with
 * get_best_match; best keeps the larger of the two. SHC() then returns one
 * orientation (+1/-1) per sequence. Sequences are grouped with
 * search_for_cluster using the threshold T (its value is set on a line that is
 * not visible here) and the clusters are reported on stderr; cluster_list[]
 * receives the cluster index of every sequence.
 * Side effects: every sequence flagged -1 has S->seq[a] replaced by its
 * inverted copy and its name prefixed with "i_".
 * NOTE(review): invert_seq lower-cases its argument in place, so all
 * sequences of S are lower-cased by this call.
 * NOTE(review): the previous S->seq[a] pointer is overwritten without a
 * visible free. */
7353 int process_est_sequence ( Sequence *S, int *cluster_list)
7355 char **inverted_seq;
7363 char buf [VERY_LONG_STRING];
7367 sens=declare_int ( S->nseq,S->nseq);
7368 a_sens=declare_int ( S->nseq,S->nseq);
7369 best=declare_int ( S->nseq,S->nseq);
7372 inverted_seq=vcalloc ( S->nseq, sizeof (char*));
7373 for ( a=0; a<S->nseq; a++)
7374 inverted_seq[a]=invert_seq ( S->seq[a]);
7376 for ( a=0; a< S->nseq-1; a++)
7379 for ( b=a+1; b<S->nseq; b++)
7382 V1=sens[a][b]=sens[b][a]=get_best_match ( S->seq[a], S->seq[b]);
7383 V2=a_sens[a][b]=a_sens[b][a]=get_best_match ( S->seq[a],inverted_seq[b]);
7384 best[a][b]=best[b][a]=(V1>V2)?V1:V2;
7387 solution=SHC ( S->nseq, a_sens, sens);
7390 for ( a=0; a<S->nseq; a++)cluster_list[a]=-1;
7391 for ( a=0; a<S->nseq; a++)
7393 n=search_for_cluster (a, n_clusters, cluster_list, T, S->nseq, best);
7394 if ( n>0)n_clusters++;
7396 fprintf ( stderr, "\nTHERE %s %d Independant Cluster(s) in your sequences",(n_clusters>1)?"are":"is",(n_clusters));
7397 for (a=0; a<n_clusters; a++)
7399 fprintf (stderr, "\n");
7400 for ( b=0; b<S->nseq; b++)
7402 if ( cluster_list[b]==a)fprintf ( stderr, "%s ", S->name[b]);
7406 for ( a=0; a<S->nseq; a++)
7408 if ( solution[a]==-1)
7410 S->seq[a]=inverted_seq[a];
7411 sprintf ( buf, "i_%s", S->name[a]);
7412 sprintf ( S->name[a], "%s", buf);
/* Recursive flood fill used to group sequences into clusters.
 * If seq is still unassigned (-1) it is given cluster_number; then every
 * unassigned sequence a that passes a test (not visible here, presumably
 * S[seq][a] against the threshold T -- confirm) is assigned the same cluster
 * and explored recursively. n accumulates the results of the recursive calls;
 * the caller treats a result >0 as "a new cluster was created". */
7418 int search_for_cluster ( int seq, int cluster_number, int *cluster_list, int T, int nseq, int **S)
7422 if (cluster_list[seq]==-1)
7424 cluster_list[seq]=cluster_number;
7427 for ( a=0; a<nseq; a++)
7428 if ( cluster_list[a]==-1)
7434 cluster_list[a]=cluster_number;
7435 n+=search_for_cluster ( a, cluster_number, cluster_list, T, nseq, S);
/* Randomised search for an orientation vector sol[] (+1/-1 per sequence).
 * sol is initialised at random with addrand(), scored with evaluate_sol, and
 * then, for at most N_IT (=VERY_LONG_STRING) iterations and while the score is
 * below VERY_LONG_STRING, one position is flipped by mutate_sol and the new
 * score compared with the current one. A better score is handled in the first
 * branch; otherwise a random draw against the current score selects the second
 * branch; the visible "sol[mut]=sol[mut]*-1" flips the mutated position back
 * (presumably when the move is rejected -- confirm against the hidden lines).
 * The initial score and progress scores are printed on stderr.
 * Note the argument order: this function receives (NST, ST) but calls
 * evaluate_sol (sol, nseq, ST, NST). */
7441 int * SHC ( int nseq, int **NST, int **ST)
7445 int score, new_score;
7446 int N_IT=VERY_LONG_STRING;
7450 sol=vcalloc ( nseq, sizeof (int));
7451 for ( a=0; a<nseq; a++)
7452 sol[a]=(addrand ((unsigned long)100)>49)?1:-1;
7454 score=evaluate_sol (sol, nseq, ST, NST);
7455 fprintf ( stderr, "\nI_Score=%d\n", score);
7458 for ( count=0,a=0; a< N_IT && score<VERY_LONG_STRING; a++, count++)
7460 mut=mutate_sol ( sol,nseq);
7461 new_score=evaluate_sol (sol, nseq, ST, NST);
7462 if ( new_score>score)
7466 else if ( (addrand ((unsigned long)VERY_LONG_STRING))>score)
7471 sol[mut]=sol[mut]*-1;
7472 if ( count==VERY_LONG_STRING)
7475 fprintf ( stderr, "\nScore=%d", score);
7478 fprintf ( stderr, "\nScore=%d\n", score);
/* Draw a random index n with addrand (nseq). SHC uses the return value as the
 * index of the mutated position; the statements that modify sol and return
 * are not visible in this excerpt. */
7482 int mutate_sol (int *sol, int nseq)
7485 n=addrand ((unsigned long)nseq);
/* Score an orientation vector sol[].
 * For every pair a<b the score adds NST[a][b] when sol[a] and sol[b] have
 * opposite signs and ST[a][b] otherwise. The result is scaled to
 * score*VERY_LONG_STRING/max_score, max_score being the sum over all pairs of
 * the larger of ST[a][b] and NST[a][b].
 * max_score is static; the condition under which it is (re)computed is not
 * visible in this excerpt.
 * NOTE(review): if max_score is 0 (e.g. fewer than two sequences or all-zero
 * matrices) the final division is a division by zero -- confirm callers. */
7489 int evaluate_sol ( int *sol, int nseq, int **ST, int **NST)
7491 static int max_score;
7496 for ( a=0; a<nseq-1; a++)
7497 for ( b=a+1; b<nseq; b++)
7499 max_score+=(ST[a][b]>NST[a][b])?ST[a][b]:NST[a][b];
7503 for ( a=0; a<nseq-1; a++)
7504 for (b=a+1; b<nseq; b++)
7505 if ( (sol[a]*sol[b])<0)score+=NST[a][b];
7506 else score+=ST[a][b];
7507 return (score*VERY_LONG_STRING)/max_score;
/* Return a newly allocated reverse complement of seq.
 * Side effect: seq itself is lower-cased in place first.
 * The input is read from its last character to its first; visible mappings are
 * n->n, g->c, c->g, a->t, t->a. The handling of any other character is not
 * visible in this excerpt. */
7511 char * invert_seq ( char *seq)
7520 for ( a=0; a<l; a++)
7521 seq[a]=tolower ( seq[a]);
7522 nseq=vcalloc ( l+1, sizeof (char));
7524 for ( a=0, b=l-1; a<l; a++, b--)
7526 if (seq[b]=='n')nseq[a]='n';
7527 else if (seq[b]=='g')nseq[a]='c';
7528 else if (seq[b]=='c')nseq[a]='g';
7529 else if (seq[b]=='a')nseq[a]='t';
7530 else if (seq[b]=='t')nseq[a]='a';
/* Compute the length of the best diagonal match between seq1 and seq2.
 * A dot matrix m[a][b] is filled with 1 when the two characters are equal or
 * when either is 'n', and 0 otherwise; the matrix is (re)declared with size
 * ml x ml when needed (the conditions are only partly visible). The diagonal
 * stretches are extracted with extract_m_diag_streches and `best` is the
 * largest stretch length (field 0). */
7538 int get_best_match ( char *seq1, char *seq2)
7554 m=declare_int (ml, ml);
7556 else if ( (ml<l1) || (ml<l2))
7560 m=declare_int (ml, ml);
7563 for ( a=0; a<l1; a++)
7565 for ( b=0; b<l2; b++)
7566 m[a][b]=((seq1[a]==seq2[b])|| seq1[a]=='n' ||seq2[b]=='n')?1:0;
7568 mdiag= extract_m_diag_streches ( m, l1, l2,seq1, seq2, &n_mdiag);
7570 for ( best=0,a=0; a<n_mdiag; a++)
7571 best=(mdiag[a][0]>best)?mdiag[a][0]:best;
/* Collect the stretches found along the diagonals of the l1 x l2 matrix m.
 * Each stretch is a row of 5 ints in mdiag: [0] length, [1] start x,
 * [2] start y, [3] end x, [4] end y. The number of stretches is returned
 * through n_mdiag[0]. A stretch is kept (the counter is advanced) only when
 * is_strech ("ta", ...) returns 0 for it.
 * The table starts with max_diag rows and grows by VERY_LONG_STRING rows when
 * the counter reaches max_diag-1; max_diag is static and keeps its grown
 * value across calls.
 * The conditions that open, extend and close a stretch are not visible in
 * this excerpt. */
7576 int** extract_m_diag_streches ( int ** m, int l1, int l2,char *seq1, char *seq2, int *n_mdiag)
7579 int b, x, y, s1, s2;
7582 static int max_diag=VERY_LONG_STRING;
7593 mdiag=declare_int ( max_diag, 5);
7595 for ( s1=l1-1, s2=0;s2<l2;)
7597 for ( in=0,x=s1, y=s2; x<l1 && y<l2; x++, y++)
7602 mdiag[n_mdiag[0]][0]++;
7605 mdiag[n_mdiag[0]][0]=1;
7606 mdiag[n_mdiag[0]][1]=x;
7607 mdiag[n_mdiag[0]][2]=y;
7615 mdiag[n_mdiag[0]][3]=x-1;
7616 mdiag[n_mdiag[0]][4]=y-1;
7617 if ( !is_strech ( "ta", seq1, seq2,mdiag[n_mdiag[0]][0], mdiag[n_mdiag[0]][1],mdiag[n_mdiag[0]][2]))n_mdiag[0]++;
7619 if (n_mdiag[0]==(max_diag-1))
7620 {mdiag=vrealloc (mdiag, (max_diag+VERY_LONG_STRING)*sizeof (int*));
7621 for ( b=max_diag; b<max_diag+VERY_LONG_STRING; b++)mdiag[b]=vcalloc ( 5, sizeof (int));
7622 max_diag+=VERY_LONG_STRING;
7630 mdiag[n_mdiag[0]][3]=x-1;
7631 mdiag[n_mdiag[0]][4]=y-1;
7632 if ( !is_strech ( "ta", seq1, seq2,mdiag[n_mdiag[0]][0], mdiag[n_mdiag[0]][1],mdiag[n_mdiag[0]][2]))n_mdiag[0]++;
/* Tell whether the diagonal segment of length len starting at seq1[x] /
 * seq2[y] is dominated by a single character of AA.
 * For each of the n characters of AA, nr counts the positions where both
 * sequences carry that character; 1 is returned as soon as the percentage
 * nr*100/len exceeds the threshold T. The values of n and T are set on lines
 * that are not visible in this excerpt.
 * NOTE(review): len is used as a divisor; callers must pass len>0. */
7638 int is_strech ( char *AA, char *seq1, char *seq2, int len, int x, int y)
7640 int n, i, j, c,a,nr;
7644 for ( a=0; a<n; a++)
7646 for (nr=0, i=x, j=y, c=0; c<len; c++, i++, j++)
7647 if ((seq1[i]==AA[a]) && (seq2[j]==AA[a]))nr++;
7648 if ( ((nr*100)/len)>T)return 1;
7654 /************************************************************************************/
7659 /************************************************************************************/
7661 char * oneletaa2threeletaa(char aa);
7662 float aa2property (char aa, char *mode);
/*
 * output_seq2struc: writes a pseudo structure for the first sequence of A.
 * ungap is applied to A->seq_al[0]; for every residue the doolittle value h
 * and the volume value v are read through aa2property, dx is derived from
 * them, and one ATOM CA record is printed to a first temporary file.
 * That file is passed to the external command extract_from_pdb (-force),
 * whose output goes to a second temporary file that is finally copied
 * character by character into outfile.
 * aa2property and oneletaa2threeletaa exit on any character that is not
 * one of the 20 lowercase amino acid letters.
 */
7664 int output_seq2struc(char *outfile, Alignment *A)
7668 float v, h, x, y, z, dx, dy, dz;
7670 char *tmpfile1, *tmpfile2;
7673 tmpfile1=vtmpnam(NULL);
7674 tmpfile2=vtmpnam(NULL);
7676 ungap (A->seq_al[0]);
7677 s=A->seq_al[0];l=strlen (s);
7678 fp1=vfopen (tmpfile1, "w");
7681 for ( a=0; a< l; a++)
7683 h=aa2property ( s[a], "doolittle" );
7684 v=aa2property (s[a], "volume");
7685 /*14.398907: peptide bond length*/
7686 dx=(float)sqrt ((double)(14.398907/(((h*h)/(v*v))+1)));
7694 fprintf (fp1, "ATOM%7d CA %s A%4d%12.3f%8.3f%8.3f 1.00 5.30\n",a+1, oneletaa2threeletaa(s[a]),a+1, x, y, z);
7697 sprintf ( command, "extract_from_pdb -infile %s -force > %s", tmpfile1, tmpfile2);
7698 my_system (command);
7699 fp1=vfopen (tmpfile2, "r");
7700 fp2=vfopen (outfile, "w");
7702 while ( (c=fgetc(fp1))!=EOF)fprintf (fp2, "%c", c);
/*
 * oneletaa2threeletaa: maps a lowercase one-letter amino acid code to its
 * three-letter uppercase name (a to ALA, r to ARG, and so on for the 20
 * standard residues). The returned strings are literals and must not be
 * freed or modified by the caller.
 * Only lowercase letters are tested; the error message and
 * myexit(EXIT_FAILURE) appear to handle every other character.
 * NOTE(review): the error tag names aa2hydropathy, not this function.
 */
7709 char * oneletaa2threeletaa(char aa)
7712 if ( aa=='a')return "ALA";
7713 else if ( aa=='r') return "ARG";
7714 else if ( aa=='n') return "ASN";
7715 else if ( aa=='d') return "ASP";
7716 else if ( aa=='c') return "CYS";
7717 else if ( aa=='q') return "GLN";
7718 else if ( aa=='e') return "GLU";
7719 else if ( aa=='g') return "GLY";
7720 else if ( aa=='h') return "HIS";
7721 else if ( aa=='i') return "ILE";
7722 else if ( aa=='l') return "LEU";
7723 else if ( aa=='k') return "LYS";
7724 else if ( aa=='m') return "MET";
7725 else if ( aa=='f') return "PHE";
7726 else if ( aa=='p') return "PRO";
7727 else if ( aa=='s') return "SER";
7728 else if ( aa=='t') return "THR";
7729 else if ( aa=='w') return "TRP";
7730 else if ( aa=='y') return "TYR";
7731 else if ( aa=='v') return "VAL";
7734 fprintf ( stderr, "\nERROR: %c is not an amino acid [FATAL::aa2hydropathy::%s]", aa, PROGRAM);
7735 myexit (EXIT_FAILURE);
/*
 * aa2property: returns a numeric property of a lowercase amino acid letter.
 *   mode NULL or doolittle : hydropathy table (4.5 for i down to -4.5 for r)
 *   mode volume            : volume table (0.66 for g up to 2.37 for w)
 * A residue missing from the selected table, or a mode that is neither of
 * the two, prints an error on stderr and calls myexit(EXIT_FAILURE).
 * Only lowercase letters are tested.
 */
7741 float aa2property (char aa, char *mode)
7743 if ( mode==NULL || strm (mode, "doolittle"))
7746 if ( aa=='i')return 4.5;
7747 else if ( aa=='v') return 4.2;
7748 else if ( aa=='l') return 3.8;
7749 else if ( aa=='f') return 2.8;
7750 else if ( aa=='c') return 2.5;
7751 else if ( aa=='m') return 1.9;
7752 else if ( aa=='a') return 1.8;
7753 else if ( aa=='g') return -0.4;
7754 else if ( aa=='t') return -0.7;
7755 else if ( aa=='w') return -0.9;
7756 else if ( aa=='s') return -0.8;
7757 else if ( aa=='y') return -1.3;
7758 else if ( aa=='p') return -1.6;
7759 else if ( aa=='h') return -3.2;
7760 else if ( aa=='e') return -3.5;
7761 else if ( aa=='q') return -3.5;
7762 else if ( aa=='d') return -3.5;
7763 else if ( aa=='n') return -3.5;
7764 else if ( aa=='k') return -3.9;
7765 else if ( aa=='r') return -4.5;
7768 fprintf ( stderr, "\nERROR: %c is not an amino acid [FATAL::aa2hydropathy::%s]", aa, PROGRAM);
7769 myexit (EXIT_FAILURE);
7772 else if (strm (mode, "volume"))
7775 if ( aa=='a')return 0.915;
7776 else if ( aa=='r') return 2.02;
7777 else if ( aa=='n') return 1.35;
7778 else if ( aa=='d') return 1.24;
7779 else if ( aa=='c') return 1.18;
7780 else if ( aa=='q') return 1.61;
7781 else if ( aa=='e') return 1.55;
7782 else if ( aa=='g') return 0.66;
7783 else if ( aa=='h') return 1.67;
7784 else if ( aa=='i') return 1.69;
7785 else if ( aa=='l') return 1.68;
7786 else if ( aa=='k') return 1.71;
7787 else if ( aa=='m') return 1.70;
7788 else if ( aa=='f') return 2.03;
7789 else if ( aa=='p') return 1.29;
7790 else if ( aa=='s') return 0.99;
7791 else if ( aa=='t') return 1.22;
7792 else if ( aa=='w') return 2.37;
7793 else if ( aa=='y') return 2.03;
7794 else if ( aa=='v') return 1.41;
7797 fprintf ( stderr, "\nERROR: %c is not an amino acid [FATAL::aa2hydropathy::%s]", aa, PROGRAM);
7798 myexit (EXIT_FAILURE);
7804 fprintf ( stderr, "\nERROR: %s is an unknown mode [FATAL::aa2hydropathy::%s]", mode , PROGRAM);
7805 myexit (EXIT_FAILURE);
7814 /************************************************************************************/
7819 /************************************************************************************/
/*
 * code_dna_aln: recodes every aligned sequence of A in place, column by
 * column, with a per-sequence counter l that restarts at 0.
 * Rules shown here for a residue r: a tilde is skipped, a dot increments
 * l, a character that is not lowercase is replaced by the digit 4, and the
 * remaining case writes the digit (l+3)%3.
 * NOTE(review): presumably l is also incremented for coded residues and A
 * is returned -- confirm.
 */
7821 Alignment *code_dna_aln (Alignment *A)
7825 for ( a=0; a< A->nseq; a++)
7827 for (l=0, b=0; b< A->len_aln; b++)
7831 else if ( r=='~')continue;
7832 else if ( r=='.')l++;
7833 else if ( !islower(r))A->seq_al[a][b]='4';
7836 A->seq_al[a][b]=(l+3)%3+'0';
/*
 * back_translate_dna_aln: replaces every aligned sequence of A by its
 * back translation (back_translate_dna_seq in RANDOM mode) and multiplies
 * len_aln by 3.
 * NOTE(review): A is reallocated to 10000 columns and the scratch buffer
 * holds 10000 bytes; no check against 3*len_aln appears here.
 */
7845 Alignment *back_translate_dna_aln (Alignment *A)
7847 /*Given a set of aligned sequences
7848 starts from left to right
7850 ambiguities are randomly resolved.
7851 returns the corresponding amino acid alignment
7857 A=realloc_aln (A, 10000);
7858 seq=vcalloc ( 10000, sizeof (char));
7861 for ( a=0; a< A->nseq; a++)
7863 seq=back_translate_dna_seq (A->seq_al[a], seq, RANDOM);
7864 sprintf ( A->seq_al[a], "%s", seq);
7866 A->len_aln=A->len_aln*3;
/*
 * back_translate_dna_seq: appends to out_seq, for each of the len
 * characters of in_seq, the codon given by back_translate_dna_codon for
 * that character and the requested mode.
 * When out_seq is NULL a buffer of len*3+1 bytes is allocated; a buffer
 * supplied by the caller must be large enough for 3 characters per input
 * residue.
 * NOTE(review): strcat is used, so out_seq is presumably emptied before
 * the loop, and out_seq is presumably returned -- confirm.
 */
7871 char * back_translate_dna_seq ( char *in_seq,char *out_seq, int mode)
7877 if (out_seq==NULL)out_seq=vcalloc ( len*3+1, sizeof (char));
7880 for (a=0; a<len; a++)
7882 strcat (out_seq, back_translate_dna_codon (in_seq[a],mode));
7888 static Sequence *rna_seq2dna_seq (Sequence *S);
7889 static Sequence *dna_seq2rna_seq (Sequence *S);
/*
 * transform_sequence: dispatches on mode.
 *   rna2dna -> rna_seq2dna_seq (S)
 *   dna2rna -> dna_seq2rna_seq (S)
 * Any other mode is reported through printf_exit with EXIT_FAILURE.
 */
7890 Sequence * transform_sequence ( Sequence *S, char *mode)
7892 if ( strm (mode, "rna2dna"))
7893 return rna_seq2dna_seq (S);
7894 else if ( strm (mode, "dna2rna"))
7895 return dna_seq2rna_seq (S);
7897 printf_exit (EXIT_FAILURE, stderr, "Unknown -transform mode: %s [FATAL:%s]\n", mode,PROGRAM);
/*
 * rna_seq2dna_seq: rewrites every sequence of S in place, turning u into t
 * and U into T. The call exits through printf_exit unless S->type is DNA
 * or RNA.
 * NOTE(review): the HERE call traces each converted sequence and looks
 * like a debugging leftover; strlen is also evaluated at every loop test.
 * NOTE(review): presumably S is returned -- confirm.
 */
7900 Sequence *rna_seq2dna_seq (Sequence *S)
7904 if ( !strm(S->type, "DNA") && !strm (S->type, "RNA")) printf_exit (EXIT_FAILURE, stderr, "Sequences should be *RNA* type [FATAL:%s]\n", PROGRAM);
7905 for ( a=0; a<S->nseq; a++)
7907 for (b=0; b<strlen (S->seq[a]); b++)
7909 if ( S->seq[a][b]=='u') S->seq[a][b]='t';
7910 if ( S->seq[a][b]=='U') S->seq[a][b]='T';
7912 HERE ("%s", S->seq[a]);
/*
 * dna_seq2rna_seq: rewrites every sequence of S in place, turning t into u
 * and T into U over the first S->len[a] characters of each sequence.
 * The call exits through printf_exit unless S->type is DNA or RNA; the
 * message reports the offending type first and the program name second,
 * in the order required by the format string.
 */
7916 Sequence *dna_seq2rna_seq (Sequence *S)
7920 if ( !strm(S->type, "DNA") && !strm (S->type, "RNA")) printf_exit (EXIT_FAILURE, stderr, "Sequences should be *DNA* type (type=%s) [FATAL:%s]\n", S->type, PROGRAM);
7921 for ( a=0; a<S->nseq; a++)
7922 for (b=0; b<S->len[a]; b++)
7924 if ( S->seq[a][b]=='t') S->seq[a][b]='u';
7925 if ( S->seq[a][b]=='T') S->seq[a][b]='U';
7932 int get_longest_frame (char *seq, int mode);
/*
 * translate_dna_aln: translates an aligned nucleotide alignment in place.
 * frame 3 or 4: for every sequence get_longest_frame picks a frame f; the
 * sequence is either shifted by f, or passed through complement_string and
 * then shifted by f, and the choice is written into seq_comment.
 * The other branch shown overwrites the first frame columns of every
 * sequence with gaps.
 * Then, codon by codon (translate_dna_codon with z as stop character),
 * three columns are rewritten: either three gaps, or one symbol (o, x or
 * the translated residue r) followed by two gaps.
 * NOTE(review): the conditions selecting o and x, and the final
 * compaction of the alignment, are not established by these lines.
 */
7933 Alignment *translate_dna_aln (Alignment *A, int frame)
7935 /*Given a set of aligned sequences
7936 starts from left to right
7938 2nuc+1gap, 1nuc+2gap->3 gaps
7940 returns the corresponding amino acid alignment
7947 if (frame==3 || frame ==4)
7950 for (a=0; a< A->nseq; a++)
7954 f=get_longest_frame (d,frame);
7955 buf=vcalloc ( strlen (d)+1, sizeof (char));
7958 sprintf (buf, "%s", d+f);
7959 sprintf (d, "%s", buf);
7960 sprintf (A->seq_comment[a], " frame: %d", f);
7965 sprintf ( buf, "%s", d);
7966 buf=complement_string (buf);
7967 sprintf (d, "%s",buf+f);
7968 sprintf (A->seq_comment[a], " frame: %d Reverse Complement", f);
7976 for ( a=0; a< A->nseq; a++)
7977 for (b=0; b< frame; b++)
7978 A->seq_al[a][b]='-';
7982 for ( b=0; b< A->nseq; b++)
7983 for ( a=0; a< A->len_aln;)
7986 r=translate_dna_codon (A->seq_al[b]+a, 'z');
7989 A->seq_al[b][a++]='-';
7990 A->seq_al[b][a++]='-';
7991 A->seq_al[b][a++]='-';
7995 A->seq_al[b][a++]='o';
7996 A->seq_al[b][a++]='-';
7997 A->seq_al[b][a++]='-';
8001 A->seq_al[b][a++]='x';
8002 A->seq_al[b][a++]='-';
8003 A->seq_al[b][a++]='-';
8007 A->seq_al[b][a++]=r;
8008 A->seq_al[b][a++]='-';
8009 A->seq_al[b][a++]='-';
/*
 * get_longest_frame: tries nf reading frames of in_seq (nf=3 for mode 3,
 * nf=6 for mode 4) on a private copy of the sequence and remembers in
 * best_frame the index a with the largest l. From a==3 onward the copy is
 * replaced by complement_string (seq). The comparison is >=, so on a tie
 * the later frame wins.
 * NOTE(review): presumably l is the longest stop-free run of the
 * translation and best_frame is returned -- confirm.
 */
8017 int get_longest_frame (char *in_seq, int mode)
8025 seq=vcalloc (strlen (in_seq)+1, sizeof (char));
8026 prot=vcalloc (strlen (in_seq)+1, sizeof (char));
8027 sprintf ( seq, "%s", in_seq);
8029 if ( mode == 3)nf=3;
8030 else if ( mode == 4) nf=6;
8032 for (a=0; a<nf; a++)
8035 if (a==3)seq=complement_string (seq);
8037 prot=translate_dna_seq ( seq,f,'\0', prot);
8039 if (l>=max_l){max_l=l;best_frame=a;}
/*
 * clean_gdna_aln: dynamic programming over the states ORF1, ORF2, ORF3
 * and NC to decide, for each sequence, which columns are read as codons.
 * Steps shown here:
 *  1. the alignment is lowercased and every t becomes u;
 *  2. each sequence is flagged as DNA or not (get_string_type);
 *  3. a column score is stored in score[A->nseq][a]: sequence pairs are
 *     compared through the pam250mt matrix on translated codons (DNA
 *     rows) or on residues (other rows), and a codon translating to x
 *     forces the value F;
 *  4. for each sequence v_tab (best value) and v_tab_p (best previous
 *     state) are filled for p in [3, len_aln];
 *  5. the trace back writes into T: state 0 gets the translated codon,
 *     states 1 and 2 a gap, the rest keeps the initial dot.
 * NOTE(review): presumably T is the returned alignment -- confirm.
 */
8046 Alignment *clean_gdna_aln (Alignment *A)
8048 int a, b, c, r1, r2,s, p, n, tn;
8056 /*Viterbi Parameters*/
8057 int AL=0; /*Allowed Transition*/
8058 int F=-1000000; /*Forbidden Transition*/
8059 int SPLICE_PENALTY=100;
8060 int ORF1=0, ORF2=1, ORF3=2, NC=3;
8062 int state, pstate, best_e, best_pstate_p,best_state_p, best_pstate_v, best_state_v, v;
8070 best_state_p=best_state_v=best_pstate_p=best_pstate_v=best_e=0;
8071 buffer=vcalloc ( 100000, sizeof (char));
8072 is_dna=vcalloc ( A->nseq, sizeof (int));
8073 score=declare_int ( A->nseq+1, A->len_aln);
8076 if ( !mat)mat=read_matrice("pam250mt");
8078 col=vcalloc ( A->nseq, sizeof (int));
/* The bound is inclusive: the terminating character is processed as well. */
8080 for (a=0; a<= A->len_aln; a++)
8081 for ( b=0; b< A->nseq; b++){A->seq_al[b][a]=tolower(A->seq_al[b][a]); A->seq_al[b][a]=(A->seq_al[b][a]=='t')?'u':A->seq_al[b][a];}
8083 for ( a=0; a< A->nseq; a++)
8085 sprintf ( buffer, "%s", A->seq_al[a]);
8087 is_dna[a]=strm ( get_string_type (buffer), "DNA");
8091 for (a=0; a< A->len_aln-2; a++)
8093 for (b=0; b< A->nseq; b++)
8095 if (is_dna[b])col[b]=translate_dna_codon (A->seq_al[b]+a, 'x');
8096 else col[b]=tolower ( A->seq_al[b][a]);
8099 for (n=0,tn=0,b=0; b< A->nseq; b++)
8100 for ( c=b; c< A->nseq; c++ )
8105 if (r1=='x' || r2=='x'){score[A->nseq][a]=F;break;}
8106 else if (r1=='-' && r2=='-');
8107 else if (r1=='-' || r2=='-');
8111 if ( is_dna[b] && is_dna[c])score[A->nseq][a]+= mat[r1-'A'][r2-'A'];
8112 else score[A->nseq][a]+=mat[r1-'A'][r2-'A']* (A->nseq*A->nseq);
8114 n+=( !is_gap(r1) && !is_gap(r2));
8115 score[A->nseq][a]=(((tn!=0)?score[A->nseq][a]/tn:0));
8122 transitions=declare_int ( nstate, nstate);
8123 v_tab=declare_int ( A->len_aln+2, nstate );
8124 v_tab_p=declare_int ( A->len_aln+2, nstate );
/* Every transition starts forbidden; the allowed ones are opened below. */
8126 for (a=0; a<nstate;a++)
8127 for (b=0; b<nstate;b++)
8128 {transitions[a][b]=F;}
8130 transitions[ORF1][ORF2]=AL;
8131 transitions[ORF2][ORF3]=AL;
8132 transitions[ORF3][ORF1]=AL;
8134 transitions[ORF3][NC] =AL-SPLICE_PENALTY;
8135 transitions[NC][ORF1] =AL-SPLICE_PENALTY;
8138 for ( s=0; s<A->nseq; s++)
8140 for ( p=0; p<=A->len_aln; p++){for (state=0; state< nstate; state++)v_tab_p[p][state]=-1; }
8141 for (p=1+2; p<= A->len_aln; p++)
8144 for (state=0; state< nstate; state++)
8147 if ( state==NC){e=-best_e;}
8150 e=score[A->nseq][(p-1)-state];
8151 if ( state==0)best_e=e;
8152 else best_e=MAX(e, best_e);
8155 for ( pstate=0; pstate<nstate; pstate++)
8157 v=e+transitions[pstate][state]+v_tab[p-1][pstate];
8158 if (pstate==0 ||(v>best_pstate_v) )
8161 best_pstate_p=pstate;
8165 v_tab[p][state]=best_pstate_v;
8166 v_tab_p[p][state]=best_pstate_p;
8167 if (state==0 ||best_pstate_v>best_state_v )
8170 best_state_v=best_pstate_v;
8178 for (p=0; p< A->len_aln; p++)T->seq_al[s][p]='.';
8179 for (p=A->len_aln; p>0; p--)
8182 if ( best_state_p==0)T->seq_al[s][p-1]=translate_dna_codon (A->seq_al[s]+(p-1), 'x');
8183 else if ( best_state_p==1 || best_state_p==2)T->seq_al[s][p-1]='-';
8187 best_state_p=v_tab_p[p][best_state_p];
/*
 * clean_cdna_aln: dynamic programming over the states NC, C1, C2, C3,
 * START and END, run once per sequence.
 * Steps shown here:
 *  1. emission[a] is computed per column from em1, the fraction of
 *     sequences whose codon at a does not translate to a gap; em2 counts
 *     pairs whose pam250mt value is above 1;
 *  2. all transitions start forbidden (F); the allowed ones are opened,
 *     three of them carrying PENALTY;
 *  3. score_tab and state_tab are filled for p in [1, len_aln]; NC emits
 *     -10 and C2, C3 read the emission of an earlier column;
 *  4. the best state able to reach END is selected and the trace back
 *     stores the state digits into B;
 *  5. A is rewritten codon by codon with the characters dot and tilde.
 * The three tables are released with free_int before the end.
 */
8198 Alignment *clean_cdna_aln (Alignment *A)
8200 /*Given an alignmnet of nucleotides
8201 Returns the same alignmnent whith non coding nucleotides replaced with dots
8203 at each position, the emission probability is the sum of pair of the substitution of amino-acids
8216 /*Viterbi Parameters*/
8217 int AL=0; /*Allowed Transition*/
8218 int F=-1000000; /*Forbidden Transition*/
8220 int NC, C1,C2, C3, START, END;
8222 int state=0,best_state=0, score=0, best_score=0;
8241 buffer=vcalloc ( 100000, sizeof (char));
8242 emission=vcalloc (A->len_aln, sizeof (int));
8246 mat=read_matrice("pam250mt");
8249 /*Computation of the emission proba for the coding state*/
8252 for (a=0; a< A->len_aln; a++)
8255 /*First component: % occupancy of the column*/
8257 for ( b=0; b< A->nseq; b++) em1+=!is_gap(translate_dna_codon (A->seq_al[b]+a, '-'));
8258 em1=em1/(float)A->nseq;
8260 /*Second Component: % similarity within column*/
8262 for (n=0,b=0; b< A->nseq-1; b++)
8264 r1=translate_dna_codon (A->seq_al[b]+a, '-');
8266 for (c=b+1; c<A->nseq; c++)
8268 r2=translate_dna_codon (A->seq_al[c]+a, '-');
8269 if (is_gap(r2) || is_gap(r1));
8273 em2+=((mat[r1-'A'][r2-'A'])>1)?1:0;
8277 em2=em2/(float)((n==0)?1:n);
8280 emission[a]=(em1*100);
8288 transitions=declare_int ( nstate, nstate);
8289 score_tab=declare_int ( A->len_aln+2, nstate );
8290 state_tab=declare_int ( A->len_aln+2, nstate );
8292 for (a=0; a<nstate;a++)
8293 for (b=0; b<nstate;b++)
8294 {transitions[a][b]=F;}
8297 transitions[START][C1]=AL;
8298 transitions[START][NC]=AL;
8299 transitions[C3][END]=AL;
8300 transitions[NC][END]=AL;
8301 transitions[C1 ][C2 ]=AL;
8302 transitions[C2 ][C3 ]=AL;
8303 transitions[C3 ][C1 ]=AL;
8304 transitions[C3 ][NC ]=AL-PENALTY;
8305 transitions[NC ][C1 ]=AL-PENALTY;
8306 transitions[NC][NC]=AL-PENALTY;
8310 for ( s=0; s< A->nseq; s++)
8312 for ( p=0; p<=A->len_aln; p++){for (state=0; state< nstate; state++){score_tab[p][state]=F;state_tab[p][state]=-1;} }
8313 score_tab[0][START]=0;
8315 for (p=1; p<= A->len_aln; p++)
8317 for (state=0; state< nstate; state++)
8319 if ( state==START || state==END)continue;
8320 else if ( state==NC) e=-10;
8321 else if ( state==C1)
8325 else if ( state ==C2)
8328 else e=emission[p-2];
8330 else if ( state==C3)
8333 else e=emission[p-3];
8336 for (p_state=0; p_state<nstate; p_state++)
/* A predecessor whose score is F stays F, so forbidden paths never recover. */
8342 score=(score_tab[p-1][p_state]==F)?F:(e+transitions[p_state][state]+score_tab[p-1][p_state]);
8345 if(p_state==0 || score>best_score){ best_score=score;best_state=p_state;}
8349 score_tab[p][state]=best_score;
8350 state_tab[p][state]=best_state;
8355 best_score=best_state=UNDEFINED;
8356 for (state=0; state<nstate; state++)
8358 if (state==START || state==END)continue;
8359 e=transitions[state][END];
8360 if (e==F || score_tab[p-1][state]==F)continue;
8362 if (best_score==UNDEFINED || score_tab[p-1][state]>best_score)
8364 best_score=score_tab[p-1][state]+e;
8370 for (p=A->len_aln; p>0;)
8372 B->seq_al[s][p-1]=best_state+'0';
8373 best_state=state_tab[p][best_state];
8378 for ( a=0; a< A->nseq; a++)
8379 for ( b=0; b< A->len_aln;)
8385 r2=A->seq_al[a][b+1];
8386 r3=A->seq_al[a][b+2];
8389 if ( is_gap(r1) ||is_gap(r2) || is_gap(r3))
8391 A->seq_al[a][b]=(is_gap(r1))?'~':'.';
8392 A->seq_al[a][b+1]=(is_gap(r2))?'~':'.';
8393 A->seq_al[a][b+2]=(is_gap(r3))?'~':'.';
8397 else if ( s==NC+'0')
8399 A->seq_al[a][b]=(is_gap(A->seq_al[a][b]))?'~':'.';
8404 fprintf (stderr, "\nPROBLEM: [%d %d]->%d", a, b, s-'0');
8410 free_int (transitions, -1);
8411 free_int (score_tab, -1);
8412 free_int (state_tab, -1);
/*
 * translate_splice_dna_aln: dynamic programming over eight states: the
 * three reading frames ORF1..ORF3, four splice states SPL1..SPL4 and NC.
 * Steps shown here:
 *  1. the alignment is lowercased and every t becomes u;
 *  2. score[A->nseq][a] receives a column score built from pam250mt
 *     values over pairs of translated codons (F when one codon
 *     translates to x), divided by the number of pairs tn;
 *  3. transitions start forbidden and are opened one by one;
 *  4. per sequence, v_tab and v_tab_p are filled for p in [3, len_aln];
 *     the splice states emit best_frame only on a specific nucleotide
 *     (g, t, a, g for SPL1..SPL4) and F otherwise, NC emits -best_frame;
 *  5. the trace back writes into T: state 0 gets the uppercase translated
 *     codon, the splice states a gap, the rest keeps the initial dot.
 * NOTE(review): SPLICE_PENALTY is set to -1000, so AL-SPLICE_PENALTY is
 * a bonus of +1000 -- confirm the intended sign.
 * NOTE(review): the ORF3 transitions are assigned twice with the same
 * values.
 * NOTE(review): step 1 replaces t by u, so the test on the letter t for
 * SPL2 may never succeed unless the sequence is restored in between --
 * confirm.
 */
8422 Alignment *translate_splice_dna_aln (Alignment *A, Alignment *ST)
8424 int a, b, c, r1, r2,s, p, n, tn;
8430 /*Viterbi Parameters*/
8431 int AL=0; /*Allowed Transition*/
8432 int F=-1000000; /*Forbidden Transition*/
8433 int ORF1=0, ORF2=1, ORF3=2,SPL1=3, SPL2=4, SPL3=5, SPL4=6, NC=7;
8435 int frame1, frame2, frame3, best_frame;
8441 int state=0, pstate=0, best_pstate_p=0,best_state_p=0, best_pstate_v=0, best_state_v=0, v=0;
8448 score=declare_int ( A->nseq+1, A->len_aln);
8451 if ( !mat)mat=read_matrice("pam250mt");
8453 col=vcalloc ( A->nseq, sizeof (int));
8455 for (a=0; a<= A->len_aln; a++)
8456 for ( b=0; b< A->nseq; b++){A->seq_al[b][a]=tolower(A->seq_al[b][a]); A->seq_al[b][a]=(A->seq_al[b][a]=='t')?'u':A->seq_al[b][a];}
8461 for (a=0; a< A->len_aln-2; a++)
8463 for (b=0; b< A->nseq; b++)
8465 col[b]=translate_dna_codon (A->seq_al[b]+a, 'x');
8468 for (n=0,tn=0,b=0; b< A->nseq-1; b++)
8469 for ( c=b+1; c< A->nseq; c++, tn++ )
8474 if (r1=='x' || r2=='x')score[A->nseq][a]=F;
8475 else if (r1=='-' && r2=='-');
8476 else if (r1=='-' || r2=='-');
8479 score[A->nseq][a]+= mat[r1-'A'][r2-'A'];
8482 n+=( !is_gap(r1) && !is_gap(r2));
8484 score[A->nseq][a]=(((tn!=0)?score[A->nseq][a]/tn:0));
8490 transitions=declare_int ( nstate, nstate);
8491 v_tab=declare_int ( A->len_aln+2, nstate*nstate);
8492 v_tab_p=declare_int ( A->len_aln+2, nstate*nstate);
8494 for (a=0; a<nstate;a++)
8495 for (b=0; b<nstate;b++)
8496 {transitions[a][b]=F;}
8498 SPLICE_PENALTY=-1000;
8500 transitions[ORF1][ORF2] =AL;
8501 transitions[ORF1][SPL1] =AL-SPLICE_PENALTY;
8503 transitions[ORF2][ORF3] =AL;
8504 transitions[ORF2][SPL1] =AL-SPLICE_PENALTY;
8506 transitions[ORF3][ORF1] =AL;
8507 transitions[ORF3][SPL1] =AL-SPLICE_PENALTY;
8509 transitions[ORF3][ORF1] =AL;
8510 transitions[ORF3][SPL1] =AL-SPLICE_PENALTY;
8512 transitions[ORF3][NC]=AL-100;
8513 transitions[NC][ORF1]=AL-100;
8516 transitions[SPL1][SPL2]=AL;
8517 transitions[SPL2][NC ]=AL-SPLICE_PENALTY;
8518 transitions[NC ][NC ]=AL;
8519 transitions[NC ][SPL3]=AL-SPLICE_PENALTY;
8520 transitions[SPL3][SPL4]=AL;
8521 transitions[SPL4][ORF1]=AL;
8522 transitions[SPL4][ORF2]=AL;
8523 transitions[SPL4][ORF3]=AL;
8526 for ( s=0; s<A->nseq; s++)
8528 for ( p=0; p<=A->len_aln; p++){for (state=0; state< nstate; state++)v_tab_p[p][state]=-1; }
8529 for (p=1+2; p<= A->len_aln; p++)
/* Column scores for the codon starting at p-1, p-2 and p-3 (0-based). */
8531 frame1=score[A->nseq][(p-1)];
8532 frame2=score[A->nseq][(p-1)-1];
8533 frame3=score[A->nseq][(p-1)-2];
8534 best_frame=best_int (3, 1, &a, frame1, frame2, frame3);
8535 for (state=0; state< nstate; state++)
8537 r=tolower (A->seq_al[s][p-1]);
8540 if (state==ORF1)e=frame1;
8541 else if (state==ORF2)e=frame2;
8542 else if (state==ORF3)e=frame3;
8543 else if (state==SPL1)e=(r=='g')?best_frame:F;
8544 else if (state==SPL2)e=(r=='t')?best_frame:F;
8545 else if (state==SPL3)e=(r=='a')?best_frame:F;
8546 else if (state==SPL4)e=(r=='g')?best_frame:F;
8547 else if (state==NC)e=-best_frame;
8548 for ( pstate=0; pstate<nstate; pstate++)
8550 v=e+transitions[pstate][state]+v_tab[p-1][pstate];
8551 if (pstate==0 ||(v>best_pstate_v) ){best_pstate_v=v;best_pstate_p=pstate;}
8554 v_tab[p][state]=best_pstate_v;
8555 v_tab_p[p][state]=best_pstate_p;
8556 if (state==0 ||best_pstate_v>best_state_v ){best_state_p=state; best_state_v=best_pstate_v;}
8562 for (p=0; p< A->len_aln; p++)T->seq_al[s][p]='.';
8563 for (p=A->len_aln; p>0; p--)
8565 if ( best_state_p==0)T->seq_al[s][p-1]=toupper(translate_dna_codon (A->seq_al[s]+(p-1), 'x'));
8566 else if ( best_state_p>=SPL1 && best_state_p<=SPL4)T->seq_al[s][p-1]='-';
8567 best_state_p=v_tab_p[p][best_state_p];
/*
 * mutate_cdna_aln: applies random mutations to a coding DNA alignment.
 * The five rates are read from the environment variables
 * NEUTRAL_SUBSTITUTION, RANDOM_SUBSTITUTION, RANDOM_DELETION,
 * AMINO_ACID_DELETION and AMINO_ACID_SUBSTITUTION (each lookup is
 * IS_FATAL), overriding the initial values of the locals.
 * The generator is seeded with the current time (addrandinit), so two
 * runs differ. pos (aln2pos_simple) maps alignment columns to 1-based
 * residue positions; entries <= 0 are skipped.
 * Passes shown here, each enabled by a non-zero rate:
 *  - neutral substitutions: repeated neutral_substitution times; a random
 *    nucleotide is kept only when the codon still translates to the same
 *    residue and that residue is not the stop symbol o;
 *  - random substitutions: threshold drawn out of 100;
 *  - amino acid substitutions: threshold out of 100, the codon being
 *    replaced by mutate_amino_acid in clustalw_col mode;
 *  - amino acid deletions: threshold out of 1000, 1 to 4 codons gapped;
 *  - nucleotide deletions: threshold out of 1000.
 * NOTE(review): A->S is freed near the top and used afterwards, so it is
 * presumably rebuilt in between -- confirm.
 */
8577 Alignment * mutate_cdna_aln ( Alignment *A)
8582 int neutral_substitution=50;
8583 int random_substitution=0;
8584 int random_deletion=0;
8585 int amino_acid_deletion=0;
8586 int amino_acid_substitution=0;
8587 char nuc_list[]="agct";
8590 neutral_substitution=atoi(get_env_variable ("NEUTRAL_SUBSTITUTION",IS_FATAL));
8591 random_substitution =atoi(get_env_variable ("RANDOM_SUBSTITUTION", IS_FATAL));
8592 random_deletion =atoi(get_env_variable ("RANDOM_DELETION", IS_FATAL));
8593 amino_acid_deletion =atoi(get_env_variable ("AMINO_ACID_DELETION", IS_FATAL));
8594 amino_acid_substitution =atoi(get_env_variable ("AMINO_ACID_SUBSTITUTION", IS_FATAL));
8597 if (A->S)free_sequence ( A->S, (A->S)->nseq);
8600 addrandinit(time (NULL));
8603 pos=aln2pos_simple ( A, A->nseq);
8605 /* 1 Apply neutral substitutions */
8607 if ( neutral_substitution)
8609 for ( c=0; c< neutral_substitution; c++)
8611 for ( a=0; a< A->nseq; a++)
8614 for ( b=0; b< A->len_aln; b++)
8617 if (pos[a][b]<=0)continue;
/* ps is the 0-based start of the codon that contains residue pos[a][b]. */
8618 ps=MAX(0,pos[a][b]-(pos[a][b]-1)%3-1);
8621 n1=(A->S)->seq[a][pos[a][b]-1];
8622 r1=translate_dna_codon ( (A->S)->seq[a]+ps, 'o');
8624 n2=nuc_list[(int)addrand((unsigned long) 4)];
8625 (A->S)->seq[a][pos[a][b]-1]=n2;
8626 r2=translate_dna_codon ( (A->S)->seq[a]+ps, 'o');
8629 if ( r1==r2 && r1!='o')A->seq_al[a][b]=n2;
8631 else (A->S)->seq[a][pos[a][b]-1]=n1;
8637 /* 2 Apply substitutions */
8638 if ( random_substitution)
8640 for ( a=0; a< A->nseq; a++)
8642 for ( b=0; b< A->len_aln; b++)
8644 if (pos[a][b]<=0)continue;
8645 if (addrand ((unsigned long) 100)>random_substitution)continue;
8647 n1=nuc_list[(int)addrand((unsigned long)4)];
8648 (A->S)->seq[a][pos[a][b]-1]=n1;
8654 /* 3 Apply amino acid substitutions */
8655 if ( amino_acid_substitution)
8657 for ( a=0; a< A->nseq; a++)
8659 for ( b=0; b< A->len_aln; b+=3)
8661 if (pos[a][b]<=0)continue;
8662 if (addrand ((unsigned long) 100)>amino_acid_substitution)continue;
8663 ps=MAX(0,pos[a][b]-(pos[a][b]-1)%3-1);
8665 r1=translate_dna_codon ( (A->S)->seq[a]+ps, 'o');
8666 new_codon=mutate_amino_acid(r1, "clustalw_col");
8668 for ( c=ps; c<ps+3; c++)(A->S)->seq[a][c]=new_codon[c-ps];
8670 for ( b=0; b< A->len_aln; b++)
8672 if (pos[a][b]<=0)continue;
8673 else A->seq_al[a][b]=(A->S)->seq[a][pos[a][b]-1];
8677 /* 3 Apply amino acid deletions */
8678 if ( amino_acid_deletion)
8680 for ( a=0; a< A->nseq; a++)
8682 for ( b=0; b< A->len_aln; b+=3)
8684 if (pos[a][b]<=0)continue;
8685 if (addrand ((unsigned long) 1000)>amino_acid_deletion)continue;
8686 ps=MAX(0,pos[a][b]-(pos[a][b]-1)%3-1);
8687 n=addrand ((unsigned long) 4)+1;
8689 for ( c=ps; c<ps+(3*n) && c<A->len_aln; c++)(A->S)->seq[a][c]='-';
8691 for ( b=0; b< A->len_aln; b++)
8693 if (pos[a][b]<=0)continue;
8694 else A->seq_al[a][b]=(A->S)->seq[a][pos[a][b]-1];
8698 /* 4 Apply amino acid insertions */
8700 /*FRAMESHIFT MUTATIONS*/
8701 /* 5 Apply nucleotide deletions*/
8702 if ( random_deletion)
8704 for ( a=0; a< A->nseq; a++)
8706 for ( b=0; b< A->len_aln; b++)
8708 if (pos[a][b]<=0)continue;
8709 if (addrand ((unsigned long) 1000)>random_deletion)continue;
8712 (A->S)->seq[a][pos[a][b]-1]=n1;
8717 /* 6 Apply nucleotide deletions*/
/*
 * clean_est: rewrites the first sequence of A column by column from the
 * counts stored in (A->P)->count.
 * For column a, tot is the sum of count rows 0 to 3 and best is the
 * largest of count rows 0 to 4 (best_int). The column becomes:
 *   a gap, with a warning on stderr, in the no-information case;
 *   a gap when row 4 exceeds 30 percent of tot;
 *   the letter n when best is below 50 percent of tot.
 * NOTE(review): presumably row 4 holds the gap count and the warning
 * branch is the tot==0 case -- confirm.
 */
8723 Alignment* clean_est ( Alignment *A)
8725 /*Rules are as follow:
8726 Internal Gap > 30% Requences ----> -
8727 Best Residue < 50% Residues ----> 'N'
8733 for ( a=0; a< A->len_aln; a++)
8736 for (tot=0, b=0; b<4; b++)tot+=(A->P)->count[b][a];
8737 best=best_int (5,1, &c, (A->P)->count[0][a],(A->P)->count[1][a],(A->P)->count[2][a],(A->P)->count[3][a],(A->P)->count[4][a]);
8741 fprintf ( stderr, "\nWARNING: POSITION WITH NO INFORMATION [clean_est:%s]", PROGRAM);
8742 A->seq_al[0][a]='-';
8744 else if (((A->P)->count[4][a]*100)/tot >30)A->seq_al[0][a]='-';
8745 else if ( (best*100)/tot<50)A->seq_al[0][a]='n';
/*
 * make_symbols: builds a table of symbol group strings selected by name.
 * The table is declared STRING x STRING and filled as follows:
 *   3d_ali : gih, eb, x, #l
 *   all    : one two-letter entry per lowercase and per uppercase letter,
 *            then the entry --
 *   set1   : ilvmfywhktcagH, reqdnsP, --, #l
 *   set2   : gsacT, ndtvpS, ilkreqL, --, #l (n[0] is the running index)
 *   any    : *x
 * NOTE(review): presumably n[0] receives the number of entries in every
 * branch and the table is returned -- confirm.
 */
8753 char **make_symbols ( char *name, int *n)
8757 symbol=declare_char ( STRING, STRING);
8759 if ( strcmp (name, "3d_ali")==0)
8761 sprintf ( symbol[0], "gih");
8762 sprintf ( symbol[1], "eb");
8763 sprintf ( symbol[2], "x");
8764 sprintf ( symbol[3], "#l");
8768 else if ( strcmp (name, "all")==0)
8771 for ( i=0,a=0; a<26; a++)
8773 sprintf ( symbol[i++], "%c%c", 'a'+a, 'a'+a);
8774 sprintf ( symbol[i++], "%c%c", 'A'+a, 'A'+a);
8776 sprintf ( symbol[i++], "--");
8780 else if ( strcmp (name, "set1")==0)
8782 sprintf ( symbol[0], "ilvmfywhktcagH");
8783 sprintf ( symbol[1], "reqdnsP");
8784 sprintf ( symbol[2], "--");
8785 sprintf ( symbol[3], "#l");
8788 else if ( strcmp (name, "set2")==0)
8791 sprintf ( symbol[n[0]++], "gsacT");
8792 sprintf ( symbol[n[0]++], "ndtvpS");
8793 sprintf ( symbol[n[0]++], "ilkreqL");
8794 sprintf ( symbol[n[0]++], "--");
8795 sprintf ( symbol[n[0]++],"#l");
8797 else if ( strcmp ( name, "any")==0)
8799 sprintf ( symbol[0], "*x");
/*
 * translate_dna_seq_on3frame: translates the codon starting at every
 * position of dna_seq, so prot[a] is the residue read from position a
 * and the three frames are interleaved.
 * The work is done on a lowercased copy of the sequence in which t is
 * replaced by u. prot is allocated (l+2 bytes) when NULL.
 * NOTE(review): presumably l is the length of dna_seq and prot is
 * returned -- confirm.
 */
8809 char * translate_dna_seq_on3frame ( char *dna_seq, char stop, char *prot)
8815 if ( prot==NULL)prot=vcalloc ( l+2, sizeof (char));
8817 buf=vcalloc (l+4, sizeof (char));
8818 sprintf (buf, "%s", dna_seq);
8819 lower_string ( buf);
8820 for ( a=0; a< l; a++)buf[a]=(buf[a]=='t')?'u':buf[a];
8822 for (a=0; a< l; a++)
8823 prot[a]=translate_dna_codon (buf+a, stop);
/*
 * translate_dna_seq: translates dna_seq codon by codon starting at offset
 * frame, writing one residue per codon into prot (prot[b] for the codon
 * at frame+3*b).
 * The work is done on a lowercased copy of the sequence in which t is
 * replaced by u. prot is allocated (l/3+2 bytes) when NULL.
 * NOTE(review): presumably l is the length of dna_seq, prot is terminated
 * after the loop and then returned -- confirm.
 */
8829 char * translate_dna_seq ( char *dna_seq, int frame, char stop, char *prot)
8835 if ( prot==NULL)prot=vcalloc ( l/3 +2, sizeof (char));
8837 buf=vcalloc (l+4, sizeof (char));
8838 sprintf (buf, "%s", dna_seq);
8839 lower_string ( buf);
8840 for ( a=0; a< l; a++)buf[a]=(buf[a]=='t')?'u':buf[a];
8842 for ( b=0,a=0+frame; a< l; a+=3,b++)
8843 prot[b]=translate_dna_codon (buf+a, stop);
/*
 * back_translate_dna_codon: writes into the 4-byte buffer r a codon for
 * the amino acid aa.
 *   gap                  : three dashes
 *   value 0..9           : that number printed three times with %d
 *   digit character 0..9 : that character three times
 *   amino acid           : one of its codons; index 0 of the list when
 *                          deterministic is non-zero, otherwise an index
 *                          drawn with rand().
 * A non-gap aa is lowercased before the tests.
 * r is allocated only when it is NULL, so the same buffer is presumably
 * reused and returned at every call -- the result must then be copied
 * before the next call. TODO confirm storage class and return value.
 */
8849 char * back_translate_dna_codon ( char aa, int deterministic)
8855 if ( r==NULL)r=vcalloc (4, sizeof (char));
8857 if (!is_gap(aa))aa=tolower(aa);
8859 if (is_gap(aa))sprintf (r, "---");
8860 else if ( aa>=0 && aa<=9)
8862 sprintf (r, "%d%d%d", aa, aa,aa);
8864 else if ( aa>='0' && aa<='9')
8866 sprintf (r, "%c%c%c", aa, aa,aa);
8870 choice=(deterministic)?0:rand()%4;
8871 if ( choice==0)sprintf (r, "gca");
8872 else if ( choice==1)sprintf (r, "gcg");
8873 else if ( choice==2)sprintf (r, "gcc");
8874 else if ( choice==3)sprintf (r, "gct");
8878 choice=(deterministic)?0:rand()%2;
8879 if ( choice==0)sprintf (r, "tgc");
8880 else if ( choice==1)sprintf (r, "tgt");
8884 choice=(deterministic)?0:rand()%2;
8885 if ( choice==0)sprintf (r, "gac");
8886 else if ( choice==1)sprintf (r, "gat");
8891 choice=(deterministic)?0:rand()%2;
8892 if ( choice==0)sprintf (r, "gaa");
8893 else sprintf (r, "gag");
8897 choice=(deterministic)?0:rand()%2;
8898 if ( choice==0)sprintf (r, "ttc");
8899 else sprintf (r, "ttt");
8903 choice=(deterministic)?0:rand()%4;
8904 if ( choice==0) sprintf (r, "gga");
8905 else if ( choice==1) sprintf (r, "ggg");
8906 else if ( choice==2) sprintf (r, "ggc");
8907 else if ( choice==3) sprintf (r, "ggt");
8912 if ( choice==0)sprintf (r, "cac");
8913 else sprintf (r, "cat");
8917 choice=(deterministic)?0:rand()%3;
8918 if ( choice==0) sprintf (r, "ata");
8919 else if ( choice==1) sprintf (r, "atc");
8920 else if ( choice==2) sprintf (r, "att");
8924 choice=(deterministic)?0:rand()%2;
8925 if ( choice==0) sprintf (r, "aaa");
8926 else if ( choice==1) sprintf (r, "aag");
8931 choice=(deterministic)?0:rand()%6;
8932 if ( choice==0) sprintf (r, "cta");
8933 else if ( choice==1) sprintf (r, "ctg");
8934 else if ( choice==2) sprintf (r, "ctc");
8935 else if ( choice==3) sprintf (r, "ctt");
8936 else if ( choice==4) sprintf (r, "tta");
8937 else if ( choice==5) sprintf (r, "ttg");
8939 else if ( aa=='m')sprintf ( r, "atg");
8942 choice=(deterministic)?0:rand()%2;
8943 if ( choice==0) sprintf (r, "aac");
8944 else if ( choice==1) sprintf (r, "aat");
8948 choice=(deterministic)?0:rand()%4;
8949 if ( choice==0) sprintf (r, "cca");
8950 else if ( choice==1) sprintf (r, "ccg");
8951 else if ( choice==2) sprintf (r, "ccc");
8952 else if ( choice==3) sprintf (r, "cct");
8956 choice=(deterministic)?0:rand()%2;
8957 if ( choice==0) sprintf (r, "caa");
8958 else if ( choice==1) sprintf (r, "cag");
8962 choice=(deterministic)?0:rand()%6;
8963 if ( choice==0) sprintf (r, "cga");
8964 else if ( choice==1) sprintf (r, "cgg");
8965 else if ( choice==2) sprintf (r, "cgc");
8966 else if ( choice==3) sprintf (r, "cgt");
8967 else if ( choice==4) sprintf (r, "aga");
8968 else if ( choice==5) sprintf (r, "agg");
8973 choice=(deterministic)?0:rand()%6;
8974 if ( choice==0) sprintf (r, "tca");
8975 else if ( choice==1) sprintf (r, "tcg");
8976 else if ( choice==2) sprintf (r, "tcc");
8977 else if ( choice==3) sprintf (r, "tct");
8978 else if ( choice==4) sprintf (r, "agt");
8979 else if ( choice==5) sprintf (r, "agc");
8984 choice=(deterministic)?0:rand()%4;
8985 if ( choice==0) sprintf (r, "aca");
8986 else if ( choice==1) sprintf (r, "acg");
8987 else if ( choice==2) sprintf (r, "acc");
8988 else if ( choice==3) sprintf (r, "act");
8992 choice=(deterministic)?0:rand()%4;
8993 if ( choice==0) sprintf (r, "gta");
8994 else if ( choice==1) sprintf (r, "gtg");
8995 else if ( choice==2) sprintf (r, "gtc");
8996 else if ( choice==3) sprintf (r, "gtt");
9004 choice=(deterministic)?0:rand()%2;
9005 if ( choice==0) sprintf (r, "tac");
9006 else if ( choice==1) sprintf (r, "tat");
/*
 * translate_dna_codon: translates the first three characters of sequence
 * with the standard genetic code and returns a lowercase one-letter
 * amino acid code.
 *   - fewer than 3 characters left: the letter x is returned as soon as
 *     one of them is not a gap;
 *   - the codon is lowercased and u is read as t before the lookup;
 *   - any gap inside the codon gives a dash;
 *   - four-fold degenerate families also accept n in third position
 *     (gcn, ggn, ctn, ccn, cgn, tcn, acn, gtn);
 *   - stop codons, and any other codon containing n, give the caller
 *     supplied stop character;
 *   - anything else prints an error and calls myexit(EXIT_FAILURE).
 */
9015 int translate_dna_codon ( char *sequence, char stop)
9021 if ( (b=strlen (sequence))<3)
9023 for ( a=0; a<b; a++)
9024 if ( !is_gap(sequence[a]))return 'x';
9029 seq[0]=tolower(sequence[0]);
9030 seq[1]=tolower(sequence[1]);
9031 seq[2]=tolower(sequence[2]);
9034 seq[0]=(seq[0]=='u')?'t':seq[0];
9035 seq[1]=(seq[1]=='u')?'t':seq[1];
9036 seq[2]=(seq[2]=='u')?'t':seq[2];
9042 if ( is_gap(seq[0])||is_gap(seq[1]) || is_gap(seq[2]))return '-';
9043 else if ( strm5(seq, "gca", "gcg", "gcc", "gct","gcn"))return 'a';
9044 else if ( strm2(seq, "tgc","tgt"))return 'c';
9045 else if ( strm2(seq, "gac","gat"))return 'd';
9046 else if ( strm2(seq, "gaa","gag"))return 'e';
9047 else if ( strm2(seq, "ttc","ttt"))return 'f';
9048 else if ( strm5(seq, "gga","ggg","ggc", "ggt", "ggn"))return 'g';
9049 else if ( strm2(seq, "cac","cat"))return 'h';
9050 else if ( strm3(seq, "ata","atc","att"))return 'i';
9051 else if ( strm2(seq, "aaa","aag"))return 'k';
9052 else if ( strm6(seq, "cta","ctg","ctc", "ctt", "tta", "ttg"))return 'l';
9053 else if ( strm (seq, "ctn"))return 'l';
9054 else if ( strm (seq, "atg"))return 'm';
9055 else if ( strm2(seq, "aac","aat"))return 'n';
9056 else if ( strm5(seq, "cca","ccg","ccc", "cct","ccn"))return 'p';
9057 else if ( strm2(seq, "cag","caa"))return 'q';
9058 else if ( strm6(seq, "cga","cgg","cgc", "cgt","aga","agg"))return 'r';
9059 else if ( strm (seq, "cgn"))return 'r';
9060 else if ( strm6(seq, "tca","tcg","tcc", "tct","agc","agt"))return 's';
/* tcn is the degenerate serine family; ccn is already taken by proline above. */
9061 else if ( strm (seq, "tcn"))return 's';
9062 else if ( strm5(seq, "aca","acg","acc", "act", "acn"))return 't';
9063 else if ( strm5(seq, "gta","gtg","gtc", "gtt", "gtn"))return 'v';
9064 else if ( strm (seq, "tgg"))return 'w';
9065 else if ( strm2(seq, "tac","tat"))return 'y';
9066 else if ( strm3(seq, "tag","taa","tga"))return stop;
9067 else if ( seq[0]=='n' || seq[1]=='n' || seq[2]=='n') return stop;
9070 fprintf ( stderr, "\n%s is an unknown codon [FATAL:%s]",seq, PROGRAM);
9071 myexit (EXIT_FAILURE);
/*
 * mutate_aln: builds a mutated copy B of the alignment A.
 * r is a mutation rate given as text; an empty string means 0.01. The
 * rate is scaled by RAND_MAX into the integer ratio and each non-gap
 * residue mutates when rand()%RAND_MAX is not above ratio.
 * The alphabet alp is AGCT for DNA or RNA and the 20 amino acids for
 * PROTEIN, as reported by get_sequence_type.
 * B has twice the columns of A: column 2*a holds the lowercased residue
 * of A and column 2*a+1 a tilde that acts as an insertion slot.
 * For a mutated residue, type 0 replaces it by a dot (deletion); the two
 * other assignments shown write a random letter of alp either into the
 * slot column or over the residue itself.
 * NOTE(review): presumably B is returned and type is drawn at random --
 * confirm.
 */
9076 Alignment * mutate_aln ( Alignment *A, char *r)
9078 int a, b, c, mut,type, ratio;
9086 if ( r[0]=='\0')ratio=0.01*RAND_MAX;
9087 else ratio=atof(r)*RAND_MAX;
9090 S=get_sequence_type(S);
9094 if ( strm(S->type, "DNA") || strm(S->type, "RNA"))sprintf (alp, "AGCT");
9095 else if ( strm(S->type, "PROTEIN"))sprintf (alp, "ACDEFGHIKLMNPQRSTVWY");
9097 alp_size=strlen(alp);
9099 B=copy_aln (A,NULL);
9100 B=realloc_aln(B, B->len_aln*2+1);
9102 for ( a=0, b=0; a< A->len_aln; a++, b+=2)
9104 for ( c=0; c< A->nseq; c++)
9106 B->seq_al[c][b]=tolower(A->seq_al[c][a]);
9107 B->seq_al[c][b+1]='~';
9111 for ( c=0; c< A->nseq; c++)B->seq_al[c][b]='\0';
9112 B->len_aln=A->len_aln*2;
9117 for (a=0; a< B->len_aln; a+=2)
9118 for ( b=0; b<B->nseq; b++)
9120 if ( is_gap(B->seq_al[b][a]))continue;
9121 mut=((rand()%RAND_MAX)>ratio)?0:1;
9128 if (type==0)/*deletion*/
9130 B->seq_al[b][a]='.';
9134 B->seq_al[b][a+1]=alp[rand()%alp_size];
9138 B->seq_al[b][a]=alp[rand()%alp_size];
9146 free_sequence (S, S->nseq);
/*
 * mutate_amino_acid: returns a codon (a row of the static table triplet)
 * chosen at random for the lowercase amino acid aa.
 * Tables shown here:
 *   triplet         : the 64 codons over agct, 4 bytes per row;
 *   cw_col          : amino acid groups from make_group_aa for the
 *                     matrix name mat (mode, or clustalw_col when mode
 *                     is NULL);
 *   amino_acid_list : for each of the 20 residues, slot 0 is a counter
 *                     and slots 1..counter hold the indexes b of the
 *                     codons whose translation a1 shares a cw_col group
 *                     with a2;
 *   lu              : maps letter minus a to the index in amino_acid.
 * The lists are also printed on stderr.
 * NOTE(review): the return statement indexes triplet directly with the
 * random pick plus 1 instead of going through amino_acid_list for that
 * residue; the returned codon then looks unrelated to aa -- confirm.
 * NOTE(review): lu is only filled for the 20 amino acid letters; any
 * other lowercase letter maps to index 0.
 */
9152 char* mutate_amino_acid ( char aa, char *mode)
9156 char nucleotide[]="agct";
9157 char amino_acid[]="acdefghiklmnpqrstvwy";
9158 static char **triplet;
9159 static char **cw_col;
9161 static int **amino_acid_list;
9168 if ( !mode)sprintf (mat, "clustalw_col");
9169 else sprintf (mat, "%s", mode);
9172 triplet=declare_char ( 64, 4);
9173 for (d=0, a=0; a< 4;a++)
9174 for ( b=0; b< 4; b++)
9175 for ( c=0; c< 4; c++, d++)
9177 triplet[d][0]=nucleotide[a];
9178 triplet[d][1]=nucleotide[b];
9179 triplet[d][2]=nucleotide[c];
9182 if ( !cw_col)cw_col=make_group_aa ( &ng_cw_col,mat);
9183 if ( !amino_acid_list)
9185 amino_acid_list=declare_int ( 20, 65);
9186 for ( a=0; a< 20; a++)
9187 for ( b=0; b< 64; b++)
9189 a1=translate_dna_codon ( triplet[b], 'x');
9191 for ( d=0; d< ng_cw_col; d++)
9192 if ( is_in_set ( a1, cw_col[d]) && is_in_set ( a2, cw_col[d]))
9194 amino_acid_list[a][++amino_acid_list[a][0]]=b;
9197 lu=vcalloc ( 26, sizeof (int));
9198 for ( a=0; a<20; a++)
9200 lu[amino_acid[a]-'a']=a;
9203 for ( a=0; a< 20; a++)
9205 fprintf ( stderr, "\n%c", amino_acid[a]);
9206 for ( b=1; b<=amino_acid_list[a][0]; b++)
9207 fprintf ( stderr, "\n\t%s %c", triplet[amino_acid_list[a][b]], translate_dna_codon (triplet[amino_acid_list[a][b]], 'x'));
9212 return triplet [addrand((unsigned long)amino_acid_list[lu[aa-'a']][0])+1];
9215 /**************************************************************************************************/
9216 /******************************** ********************************************/
9217 /******************************** PROCESSING ********************************************/
9218 /******************************** ********************************************/
9222 void modify_data (Sequence_data_struc *D1in, Sequence_data_struc *D2in, Sequence_data_struc *DSTin, char **action_list,int n_actions, Action_data_struc *RAD)
9224 Sequence *COOR=NULL, *NS=NULL,*BUFS=NULL, *OUT_S=NULL;
9225 Constraint_list *CL;
9227 int value,upper_value, lower_value, start, end, a, b,c;
9228 int *count_table=NULL;
9230 Sequence_data_struc *D1;
9231 Sequence_data_struc *D2;
9232 Sequence_data_struc *DST;
9234 static int clean_flag;
9239 action=action_list[0];
9249 else if ( action[0]=='1')
9256 else if ( action[0]=='3')
9269 if (!D1->A)D1->A=copy_aln (D1in->A, NULL);
9271 if ( strm(action, "seqnos"))
9273 (D1->A)->output_res_num=1;
9275 else if ( strm (action,"aln2bootstrap"))
9277 (D1->A)=aln2bootstrap (D1->A, ATOI_ACTION (1));
9278 D1->S=aln2seq (D1->A);
9280 else if ( strm (action,"aln2sample"))
9282 (D1->A)=aln2sample (D1->A, ATOI_ACTION (1));
9283 D1->S=aln2seq (D1->A);
9285 else if ( strm (action,"aln2random_aln"))
9287 (D1->A)=aln2random_aln (D1->A, ACTION (1));
9288 D1->S=aln2seq (D1->A);
9290 else if ( strm (action, "or_scan"))
9293 D1->A=or_scan(D1->A, D2->A, ACTION(1));
9294 D1->S=aln2seq (D1->A);
9296 else if ( strm (action, "or_sar"))
9298 D1->A=or_sar(D1->A, D2->A, ACTION(1), PRINT);
9299 D1->S=aln2seq (D1->A);
9301 else if ( strm ( action, "sar2subsar"))
9306 Alignment *subA, *subS;
9310 fprintf ( stderr, "\nin=aln, in2=sar sar2subsar [filter value compound1 compound2...] | [jack1] | [file]\n");
9311 myexit (EXIT_FAILURE);
9314 sarset2subsarset ( D1->A, D2->A, &subA, &subS, main_read_aln (action_list[2], NULL));
9315 D1->A=subA;D2->A=subS;
9317 else if ( strm (action, "display_sar"))
9319 D1->A=display_sar (D1->A, D2->A, action_list[1]);
9321 else if ( strm ( action, "sar2simpred"))
9326 sar2simpred ( D1->A, D2->A, action_list[1], action_list[2], atoi(action_list[3]), atoi (action_list[4]));
9328 else if ( strm ( action, "sar2simpred2"))
9335 fprintf ( stderr, "\nERROR: +sar2simpred2 seqnamesfile posfile compound limit");
9336 myexit (EXIT_FAILURE);
9338 sar2simpred2 ( D1->A, D2->A, action_list[1], action_list[2], action_list[3], atoi (action_list[4]));
9340 else if ( strm ( action, "sar_analyze"))
9345 sar_analyze ( D1->A, D2->A,action_list[1]);
9347 else if ( strm ( action, "simple_sar_predict"))
9349 //displays each column with ist score;
9350 simple_sar_predict (D1->A, D2->A,ACTION(1));
9351 exit (EXIT_SUCCESS);
9353 else if ( strm ( action, "display_sar_analyze"))
9355 //displays each column with ist score;
9356 display_simple_sar_analyze_col (D1->A, D2->A,ACTION(1));
9357 exit (EXIT_SUCCESS);
9359 else if ( strm ( action, "display_sar_analyze_pc"))
9361 //displays each column with ist score;
9362 display_simple_sar_analyze_pair_col (D1->A, D2->A,ACTION(1));
9363 exit (EXIT_SUCCESS);
9365 else if ( strm ( action, "weight2sar"))
9372 fprintf ( stderr, "\nERROR: +weight2sar <weight_file> <limit>");
9373 myexit (EXIT_FAILURE);
9375 D1->A=weight2sar ( D1->A,D2->A, action_list[1], atoi(action_list[2]));
9378 else if ( strm ( action, "sar_weight"))
9385 fprintf ( stderr, "\nERROR: +sar_weight <sar_analyze> <compound>");
9386 myexit (EXIT_FAILURE);
9388 D1->A=aln2weighted_sar_score ( D1->A,D2->A, action_list[1], action_list[2]);
9389 D1->S=aln2seq ( D1->A);
9392 else if ( strm (action, "name2unique_name"))
9396 tmp1=vtmpnam (NULL); tmp2=vtmpnam (NULL);
9398 output_fasta_aln (tmp1,D1->A);
9399 free_aln (D1->A);free_sequence (D1->S, -1);
9400 sprintf ( command, "fasta_aln2fasta_aln_unique_name.pl %s >%s", tmp1, tmp2);
9401 my_system ( command);
9402 D1->S=get_fasta_sequence ( tmp2, NULL);
9403 D1->A=seq2aln (D1->S,NULL, 1);
9405 else if ( strm (action, "rm_tag") || strm (action, "rm_template"))
9408 char **temp_name=NULL,**temp_list=NULL, temp_nseq=0;
9411 if ( D1 && D1->A){temp_name=(D1->A)->name;temp_nseq=(D1->A)->nseq;}
9412 else if ( D1 && D1->S){temp_name=(D1->S)->name;temp_nseq=(D1->S)->nseq;}
9413 temp_list=rm_name_tag (temp_name,temp_nseq, NULL);
9414 if ( n_actions>1 && strm (action_list[1], "template"))
9417 for ( z=0; z<temp_nseq; z++)
9419 if (temp_list[z][0])
9420 {fprintf (stdout, "%s\n", temp_list[z]);}
9422 myexit (EXIT_SUCCESS);
9425 else if (strm (action, "add_template") || strm (action, "swap_header"))
9427 D1->S=seq2template_seq (D1->S, action_list[1], NULL);
9428 D1->A=seq2aln(D1->S, NULL, 1);
9430 else if ( strm ( action, "seq2year"))
9432 D1->S=seq2year (D1->S, (n_actions>1)?atoi(action_list[1]):1);
9433 D1->A=seq2aln(D1->S, NULL, 1);
9435 else if ( strm (action, "swap_lib_header"))
9438 S=main_read_seq (action_list[1]);
9442 else if ( strm (action, "weight_lib"))
9446 w=atoi (action_list[1]);
9449 for (l=0; l<(D1->CL)->ne; l++)
9450 (D1->CL)->L[l*CL->entry_len+WE]=w;
9453 else if ( strm (action, "struc2nb"))
9456 for ( c=0; c< (D1->S)->nseq; c++)
9458 struclist2nb ((D1->S)->name[c],(D1->S)->seq[c], (D1->S)->seq_comment[c], atof(action_list[1]),ACTION(2),ACTION(3) );
9460 myexit (EXIT_SUCCESS);
9465 else if ( strm(action, "seq2contacts"))
9468 D1->S=swap_header (D1->S, D2->S);
9469 for ( z=0; z< (D1->S)->nseq; z++)sprintf ( (D1->A)->name[z], "%s", (D1->S)->name[z]);
9470 DST->S=seq2contacts (D1->S, atof (action_list[1]));
9471 DST->A=copy_aln (D1->A, NULL);
9472 thread_seq_struc2aln ( DST->A,DST->S);
9473 for (z=0; z< (D1->S)->nseq; z++)
9477 else if ( strm(action, "struc2contacts"))
9480 if ( atof (action_list[3])>0)
9482 seq=map_contacts (action_list[1], action_list[2], atof (action_list[3]));
9483 fprintf ( stderr, "\n>%s %s\n%s",action_list[1], action_list[2],seq);
9486 print_contacts (action_list[1], action_list[2], atof (action_list[3]));
9488 myexit (EXIT_SUCCESS);
9490 else if ( strm(action, "treelist_prune")|| strm(action, "prune_treelist"))
9493 if (D2 && D2->S)TS=D2->S;
9494 else TS=treelist2sub_seq((D1->S),ATOI_ACTION(1));
9495 treelist2prune_treelist ( D1->S,TS, NULL);
9496 D1->A=seq2aln (D1->S, NULL, NO_PAD);
9498 else if ( strm (action, "tree2unresolved_nodes"))
9502 ns=tree2nseq (D1->T);
9503 l=vcalloc (ns, sizeof (int));
9504 tree2nnode_unresolved (D1->T, l);
9505 for ( a=0; a<ns; a++)if (l[a])fprintf ( stdout, "SIZE: %d COUNT: %d\n", a, l[a]);
9507 exit (EXIT_SUCCESS);
9509 else if ( strm(action, "tree_prune") || strm(action, "prune_tree"))
9511 D1->T=main_prune_tree ( D1->T, D2->S);
9513 else if ( strm ( action, "tree2seq"))
9515 D1->S=tree2seq(D1->T, NULL);
9516 D1->A=seq2aln (D1->S, D1->A, 1);
9518 for ( a=0; a< (D1->A)->nseq; a++)sprintf ( (D1->A)->seq_al[a], "sequence");
9520 else if ( strm (action, "seq2dpatree"))
9522 D1->T= seq2dpa_tree(D1->S,"ktup");
9524 else if ( strm (action, "tree2dpatree"))
9526 D1->T= tree2dpa_tree(D1->T,(D2 && D2->A)?D2->A:D1->A, (n_actions==1)?"idmat":action_list[1]);
9528 else if ( strm (action, "tree2group"))
9530 vfclose (tree2group (D1->T, (tree2seq(D1->T,NULL)), atoi(action_list[1]), atoi(action_list[2]),(n_actions==4)?action_list[3]:NULL, stdout));
9531 myexit (EXIT_SUCCESS);
9533 else if ( strm(action, "unroot"))
9535 D1->T=unroot_tree(D1->T);
9539 else if ( strm(action, "treelist2group")|| strm(action, "treelist2groups") )
9543 if (D2 && D2->S)TS=D2->S;
9544 else TS=treelist2seq((D1->S));
9545 treelist2groups (D1->S, TS, ACTION(1), stdout);
9546 myexit (EXIT_SUCCESS);
9548 // treelist2groups (D1->S,(D2)?D2->S:NULL, ACTION(1), stdout );
9549 //exit (EXIT_SUCCESS);
9551 else if ( strm(action, "splits2tree"))
9554 D1->T=split2tree ((D2)?D2->T:NULL,D1->S, ACTION(1));
9557 else if ( strm(action, "count_splits"))
9560 count_splits ((D2)?D2->T:NULL,D1->S, ACTION(1));
9561 exit (EXIT_SUCCESS);
9563 else if ( strm(action, "count_groups"))
9565 count_tree_groups (D1->S, ACTION(1));
9567 else if ( strm (action, "tree2dist"))
9573 td=tree2dist (D1->T,TS, NULL);
9574 if (!TS)TS=tree2seq(D1->T, NULL);
9575 for (ta=0; ta<TS->nseq; ta++)
9577 fprintf ( stdout, "%-15s ",TS->name[ta]);
9578 for ( tb=0; tb<TS->nseq; tb++)
9581 if ( ACTION(1) && strm (ACTION(1), "length"))n=1;
9583 fprintf (stdout, " %4d", td [n][ta][tb]);
9585 fprintf ( stdout, "\n");
9587 exit (EXIT_SUCCESS);
9589 else if ( strm (action, "treelist2lti"))
9592 if (D2 && D2->S)TS=D2->S;
9593 else TS=treelist2sub_seq((D1->S),ATOI_ACTION(2));
9594 treelist2lti (D1->S,TS, (int)ATOI_ACTION(1), stdout );
9597 else if ( strm (action,"treelist2frame"))
9600 if (D2 && D2->S)TS=D2->S;
9601 else TS=treelist2sub_seq((D1->S),ATOI_ACTION(1));
9602 treelist2frame (D1->S, TS);
9603 myexit (EXIT_SUCCESS);
9606 else if ( strm (action, "treelist2seq"))
9608 D1->S=treelist2sub_seq (D1->S,ATOI_ACTION(1));
9609 D1->A=seq2aln(D1->S, NULL, 1);
9611 else if ( strm (action, "treelist2leafgroup"))
9613 treelist2leafgroup (D1->S, (D2)?D2->S:NULL, ACTION(1));
9616 else if ( strm(action, "treelist2splits"))
9618 if (D1->T)D1->S=add_file2file_list ((D1->T)->file, NULL);
9619 treelist2splits (D1->S, (D2)?D2->S:NULL);
9622 else if ( strm(action, "treelist2dmat"))
9624 treelist2dmat (D1->S);
9626 else if ( strm(action, "tree_cmp") || strm (action, "tree_compare"))
9628 D1->T=main_compare_trees ( D1->T, D2->T, stdout);
9630 else if ( strm (action, "tree_scan"))
9632 D1->T=tree_scan (D1->A, D2->T, ACTION(1), ACTION(2));
9634 else if ( strm (action, "split_cmp"))
9636 main_compare_splits (D1->T, D2->T, ACTION(1), stdout);
9639 else if ( strm(action, "node_sort"))
9641 node_sort ( action_list[1], D1->T);
9642 exit (EXIT_SUCCESS);
9645 else if ( strm ( action, "avg_bootstrap"))
9647 display_avg_bootstrap (D1->T);
9648 myexit (EXIT_SUCCESS);
9651 else if ( strm (action, "tree_cog_cmp"))
9653 main_compare_cog_tree (D1->T,action_list[1]);
9654 exit (EXIT_SUCCESS);
9656 else if ( strm (action, "tree_aln_cmp"))
9658 main_compare_aln_tree (D1->T, D2->A, stdout);
9659 exit (EXIT_SUCCESS);
9661 else if ( strm(action, "change_bootstrap"))
9663 D1->T=reset_boot_tree ( D1->T, (n_actions>=2)?atoi(action_list[1]):0);
9665 else if ( strm(action, "change_distances"))
9667 D1->T=reset_dist_tree ( D1->T, (n_actions>=2)?atof(action_list[1]):0.00);
9670 else if ( strm(action, "aln2tree"))
9672 D1->T=tree_compute (D1->A, n_actions-1, action_list+1);
9674 else if ( strm(action, "similarities2tree"))
9676 D1->T=similarities_file2tree (ACTION(1));
9679 else if ( strm(action, "original_seqnos"))
9681 (D1->A)->output_res_num=2;
9684 else if ( strm (action, "aln2pred"))
9686 aln2pred (D1->A, D2->A, ACTION (1));
9687 exit (EXIT_SUCCESS);
9689 else if ( strm(action, "evaluate"))
9694 DST->A=copy_aln (D1->A, NULL);
9695 DST->S=aln2seq(DST->A);
9696 if (n_actions>1 && strm ( action_list[1], "categories"))
9698 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt"));
9699 DST->A= main_coffee_evaluate_output(DST->A, CL, "categories");
9701 else if (n_actions>1 && strm ( action_list[1], "sar"))
9703 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt"));
9704 DST->A= main_coffee_evaluate_output(DST->A, CL, "sar");
9706 else if (n_actions>1 && strstr ( action_list[1], "boxshade"))
9708 char color_mode[1000];
9709 sprintf (color_mode,"boxshade_%d", atoi(ACTION2(2,"30")));
9710 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt"));
9711 DST->A= main_coffee_evaluate_output(DST->A, CL, color_mode);
9716 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice((n_actions==1)?"pam250mt":action_list[1]));
9717 DST->A= main_coffee_evaluate_output(DST->A, CL, "matrix");
9720 DST->S=aln2seq ( DST->A);
9724 sprintf ( A->name[A->nseq], "cons");
9725 sprintf ( A->seq_al[A->nseq], "%s", aln2cons_seq_mat (A, "idmat"));
9728 else if ( strm (action, "sp_evaluate"))
9730 fprintf ( stdout, "SP Score: %.2f", sum_pair ((DST && DST->A)?DST->A:D1->A,ACTION(1),atoi(ACTION2(2,"0")),atoi(ACTION2(3,"0"))));
9731 exit (EXIT_SUCCESS);
9733 else if ( strm (action, "lat_evaluate"))
9736 score=lat_sum_pair ( D1->A, action_list[1]);
9737 fprintf ( stdout, "\nLAT_SCORE: %.2f", score);
9738 exit (EXIT_SUCCESS);
9741 else if ( strm (action, "add_scale"))
9743 D1->A=aln2scale (D1->A, ACTION(1));
9745 else if ( strm (action, "RNAfold_cmp"))
9747 D1->A=compare_RNA_fold (D1->A, D2->A);
9749 else if ( strm (action, "aln2alifold"))
9751 D1->A=aln2alifold (D1->A);
9752 D1->S=aln2seq ( D1->A);
9756 else if ( strm (action, "add_alifold"))
9758 D1->A=add_alifold2aln (D1->A, (D2)?D2->A:NULL);
9761 else if ( strm (action, "alifold2analyze"))
9763 D1->A=alifold2analyze (D1->A, (D2)?D2->A:NULL, ACTION(1));
9764 D1->S=aln2seq(D1->A);
9766 else if ( strm (action, "aln2conservation"))
9768 D1->A=aln2conservation ( D1->A, ATOI_ACTION (1), ACTION (2));
9769 exit (EXIT_FAILURE);
9771 else if ( strm (action, "aln2cons"))
9775 cons_name=vcalloc (100, sizeof (char));
9776 sprintf(cons_name, "%s", (n_actions<=2)?"Cons":action_list[2]);
9777 cons_seq=aln2cons_seq_mat (D1->A, (n_actions==1)?"blosum62mt":action_list[1]);
9778 free_aln (D1->A);free_sequence(D1->S, -1);
9779 D1->S=fill_sequence_struc (1, &cons_seq, &cons_name);
9781 (D1->S)->len[0]=strlen (cons_seq); sprintf ( (D1->S)->seq[0], "%s", cons_seq);
9782 D1->A=seq2aln (D1->S, NULL, KEEP_GAP);
9783 vfree (cons_name);vfree (cons_seq);
9785 else if ( strm (action, "seq2filter"))
9787 D1->S=seq2filter ( D1->S, atoi(action_list[1]), atoi(action_list[2]));
9790 else if ( strm (action, "aln2resindex"))
9792 //-in: aln, file: ref_seq ref_res target_seq
9793 //-in2 target sequences
9794 aln2resindex (D1->A, (D2)?D2->A:NULL, stdout);
9795 exit (EXIT_SUCCESS);
9797 else if (strm(action, "keep_name"))
9799 RAD->keep_name=1-RAD->keep_name;
9801 else if (strm(action, "use_consensus") ||strm(action, "use_cons") )
9803 RAD->use_consensus=1-RAD->use_consensus;
9805 else if ( strm(action, "ungap"))
9807 seq2aln (D1->S, D1->A, 1);
9809 else if ( strm2(action, "rmgap", "rm_gap"))
9812 ungap_aln_n (D1->A, (n_actions==1)?100:atoi(action_list[1]));
9813 free_sequence ( D1->S, (D1->S)->nseq);
9814 D1->S=aln2seq ( D1->A);
9817 else if ( strm(action, "rmgap_col"))
9819 D1->A=remove_gap_column ( D1->A,action_list[1]);
9821 else if ( strm(action,"random"))
9824 D1->A= make_random_aln(NULL,(n_actions==1)?1:atoi(action_list[1]),(n_actions==2)?100:atoi(action_list[2]),"acdefghiklmnpqrstvwy");
9826 D1->S=aln2seq ( D1->A);
9829 else if ( strm(action, "landscape"))
9832 set_landscape_msa ((n_actions==1)?0:atoi(action_list[1]));
9834 else if ( strm(action, "clean_maln"))
9838 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
9839 myexit(EXIT_FAILURE);
9841 (DST->A)=aln2number (DST->A);
9842 D1->A=clean_maln(D1->A, DST->A,(n_actions==1)?1:atoi(action_list[1]),(n_actions==1)?1:atoi(action_list[2]));
9844 else if ( strm (action, "extract"))
9847 COOR=get_pir_sequence (RAD->coor_file, NULL);
9848 D1->S=extract_sub_seq ( COOR, D1->S);
9850 D1->A=declare_Alignment(D1->S);
9851 seq2aln (D1->S, D1->A, RAD->rm_gap);
9852 free_sequence (COOR, COOR->nseq);
9854 else if ( strm (action, "reorder_column"))
9859 Alignment *RO1, *RO2;
9863 RO1=rotate_aln (D1->A,NULL);
9864 if (ACTION(1) && strm (ACTION(1), "tree"))
9866 D1->T=tree_compute (RO1,n_actions-2, action_list+2);
9867 OUT_S=tree2seq(D1->T, NULL);
9868 RO1=reorder_aln(RO1, OUT_S->name, OUT_S->nseq);
9870 else if ( ACTION(1) && strm (ACTION(1), "random"))
9872 RO1=reorder_aln ( RO1, NULL, RO1->nseq);
9875 RO2=rotate_aln (RO1, NULL);
9876 for (s=0; s< RO2->nseq; s++)
9877 sprintf ( RO2->name[s], "%s", (D1->A)->name[s]);
9881 D1->S=aln2seq(D1->A);
9884 else if ( strm (action, "reorder"))
9887 if ( n_actions==2 && strm (action_list[1], "random"))
9889 D1->A=reorder_aln ( D1->A, NULL, (D1->A)->nseq);
9891 else if (n_actions==2 && strm (action_list[1], "scramble"))
9893 D1->A=aln2scramble_seq(D1->A);
9896 else if ( n_actions==2 && strm (action_list[1], "tree"))
9899 OUT_S=tree2seq (D2->T, NULL);
9900 D1->A=reorder_aln(D1->A, OUT_S->name, OUT_S->nseq);
9901 free_sequence (D1->S,(D1->S)->nseq);
9902 D1->S=aln2seq (D1->A);
9906 (D2->A)->S=aln2seq (D2->A);
9907 (D1->A)->S=aln2seq (D1->A);
9908 OUT_S=trim_aln_seq_name(D2->A, D1->A);
9909 D1->A=reorder_aln(D1->A, OUT_S->name, OUT_S->nseq);
9910 free_sequence (D1->S,(D1->S)->nseq);
9911 D1->S=aln2seq (D1->A);
9914 else if ( strm (action, "cat_aln"))
9916 /*D1->A=aln_cat ( D1->A, D2 ->A);*/
9918 if (D2 && D2->A && !ACTION(1))
9919 D1->A=concatenate_aln (D1->A, D2->A, ACTION(1));
9920 else if (ACTION(1) && is_aln(ACTION(1)))
9928 B=main_read_aln (ACTION(n), NULL);
9929 D1->A=concatenate_aln (D1->A, B, NULL);
9932 D1->S=aln2seq(D1->A);
9939 A=main_read_aln ((D1->A)->name[0], NULL);
9941 for ( a=1; a<(D1->A)->nseq; a++)
9943 B=main_read_aln ((D1->A)->name[a], NULL);
9944 A=concatenate_aln (A, B, ACTION(1));
9948 D1->S=aln2seq(D1->A);
9952 else if ( strm ( action, "msalist2cat_pwaln"))
9964 min=atoi(action_list[1]);
9965 max=atoi(action_list[2]);
9968 fprintf ( stdout, ">A\n");
9969 for (a=0;a<(D1->S)->nseq; a++)
9972 HERE ("process %s", (D1->S)->name[a]);
9973 A=main_read_aln((D1->S)->name[a],NULL);
9974 for (b=0; b<A->nseq-1; b++)
9976 for ( c=b+1; c<A->nseq; c++)
9978 sim=get_seq_sim (A->seq_al[b], A->seq_al[c], "-", "");
9979 if (sim>=min && sim<=max)fprintf (stdout, "xxx%s", A->seq_al[b]);
9984 fprintf ( stdout, "\n>B\n");
9985 for (a=0;a<(D1->S)->nseq; a++)
9988 HERE ("process %s", (D1->S)->name[a]);
9989 A=main_read_aln((D1->S)->name[a],NULL);
9990 for (b=0; b<A->nseq-1; b++)
9992 for ( c=b+1; c<A->nseq; c++)
9994 sim=get_seq_sim (A->seq_al[b], A->seq_al[c], "-", "");
9995 if (sim>=min && sim<=max)fprintf (stdout, "xxx%s", A->seq_al[c]);
10001 fprintf ( stdout, "\n");
10002 exit (EXIT_SUCCESS);
10005 else if ( strm (action, "collapse_tree"))
10007 D1->T=tree2collapsed_tree (D1->T, n_actions-1, action_list+1);
10009 else if ( strm (action, "collapse_aln"))
10011 D1->A=aln2collapsed_aln (D1->A, n_actions-1, action_list+1);
10013 else if ( strm (action, "extract_aln"))
10015 D1->A=aln2sub_aln_file (D1->A, n_actions-1, action_list+1);
10016 myexit (EXIT_SUCCESS);
10021 else if ( strm (action, "remove_aa"))
10024 pos=atoi(action_list[1]);
10025 len=atoi(action_list[2]);
10026 n=atoi (action_list[3]);
10027 if ( atoi (action_list[4])==1)len=-len;
10030 fprintf ( stderr, "\nWARNING: rm_aa, position (pos) and iteration number (n) simulatneously defined. Iteration number reset to 1 [%s]\n", PROGRAM);
10033 for ( a=0; a< n; a++)
10034 D1->A=probabilistic_rm_aa (D1->A, pos, len);
10036 else if ( strm (action, "remove_nuc"))
10039 pos=atoi(action_list[1]);
10041 if ( pos>3 || pos<1)
10042 printf_exit (EXIT_FAILURE, stderr, "Remove_nuc: indicate a number between 1 and 3\n");
10045 for ( c=0,a=0; a<(D1->A)->len_aln; a++, c++)
10048 for (b=0; b<(D1->A)->nseq; b++)
10052 (D1->A)->seq_al[b][a]='-';
10057 D1->S=aln2seq (D1->A);
10060 else if (strm ( action, "conserved_positions"))
10068 for ( a=0; a< A->nseq && !cache; a++)
10070 if ( strm (action_list[1], A->name[a]))
10072 cache=vcalloc ( A->len_aln+1, sizeof (int));
10073 for ( c=0,b=0; b<A->len_aln; b++)
10075 if ( is_gap (A->seq_al[a][b]))cache[b]=-1;
10081 for ( a=0; a< A->len_aln; a++)
10083 r1=A->seq_al[0][a];
10084 if ( is_gap(r1))continue;
10085 for ( c=0,b=0; b<A->nseq; b++)
10087 r2=A->seq_al[b][a];
10090 if ( (c*100)/A->nseq>=atoi(action_list[2]))
10091 fprintf ( stdout, "COL: %d Res: %c %s %d\n", a+1, r1, action_list[1], cache[a]+atoi(action_list[3]));
10093 exit (EXIT_FAILURE);
10095 else if (strm ( action, "extract_block") )
10098 BUF=copy_aln (D1->A, NULL);
10099 if ( check_file_exists(action_list[1]))
10100 BUF=extract_aln3(BUF,action_list[1]);
10102 BUF=extract_aln2(BUF,atoi(action_list[2]),atoi(action_list[3]),action_list[1]);
10103 D1->A=copy_aln (BUF,D1->A);
10106 else if ( strm ( action, "extract_pos_list"))
10108 D1->A=alnpos_list2block (D1->A, n_actions-1, action_list+1);
10110 else if ( strm ( action, "seq2msa"))
10112 D1->A=simple_progressive_aln ( D1->S, NULL, NULL, action_list[1]);
10114 else if ( strm ( action, "realign_block") )
10116 D1->A=realign_block ( D1->A, atoi (action_list[1]), atoi (action_list[2]), (n_actions==4)?action_list[3]:NULL);
10118 else if ( strm (action, "extract_seq"))
10121 if ( check_file_exists (action_list[1]))
10124 BUFS=main_read_seq (action_list[1]);
10125 action_list=BUFS->name;
10126 n_actions=BUFS->nseq;
10135 for ( a=0; a< n_actions;)
10139 if ( n_actions==1 || is_file==1)
10148 start=(strm2 (s,"#","*"))?1:(atoi(action_list[a+1]));
10149 end= (strm2 (action_list[a+2],"#","*"))?0:(atoi(action_list[a+2]));
10153 if ( strm2 (s, "#", "*"))
10155 OUT_S=extract_one_seq((D1->A)->name[0],start, end, D1->A, RAD->keep_name);
10156 for (b=1; b< (D1->A)->nseq; b++)
10158 NS=extract_one_seq((D1->A)->name[b],start, end, D1->A, RAD->keep_name);
10159 if (count_n_res_in_array(NS->seq[0], -1))
10160 OUT_S=add_sequence ( NS,OUT_S, 0);
10165 if ( a==1)OUT_S=extract_one_seq(s,start, end, D1->A, RAD->keep_name);
10168 NS=extract_one_seq(s,start, end, D1->A, RAD->keep_name);
10169 OUT_S=add_sequence ( NS,OUT_S, 0);
10175 D1->A=declare_Alignment(D1->S);
10176 seq2aln (D1->S, D1->A, RAD->rm_gap);
10179 else if ( strm (action, "extract_seq_list"))
10181 if ( check_file_exists (action_list[1]))
10184 BUFS=main_read_seq (action_list[1]);
10185 action_list=BUFS->name;
10186 n_actions=BUFS->nseq;
10194 for ( a=0; a< n_actions;a++)
10196 NS=extract_one_seq(action_list[a],1,0, D1->A, KEEP_NAME);
10197 OUT_S=add_sequence ( NS,OUT_S, 0);
10202 D1->A=declare_Alignment(D1->S);
10203 seq2aln (D1->S, D1->A, RAD->rm_gap);
10205 else if ( strm (action, "remove_seq") || strm (action, "rm_seq"))
10212 list=declare_char ((D1->S)->nseq, 200);
10214 buf=vcalloc ((D1->S)->max_len+1, sizeof (char));
10215 for ( n=0,a=0; a< (D1->A)->nseq; a++)
10218 sprintf (buf, "%s", (D1->S)->seq[a]);
10222 for (c=1, b=1; b< n_actions; b++)
10224 if ( strm (action_list[b], (D1->S)->name[a])){(D1->S)->seq[a]=NULL;break;}
10225 else if ( strm (action_list[b], "empty") && l==0)
10227 fprintf ( stderr, "WARNING: Sequence %s does not contain any residue: automatically removed from the set [WARNING:%s]\n",(D1->S)->name[a], PROGRAM);
10228 (D1->S)->seq[a]=NULL;break;
10230 else if ( strm (action_list[b], "unique"))
10232 if ( name_is_in_list ((D1->S)->name[a], list,n, 100)!=-1)
10234 (D1->S)->seq[a]=NULL;break;
10238 sprintf ( list[n++], "%s", (D1->S)->name[a]);
10243 D1->S=duplicate_sequence (D1->S);
10245 free_char ( list, -1);
10246 D1->A=declare_Alignment(D1->S);
10247 seq2aln (D1->S, D1->A, RAD->rm_gap);
10250 else if ( strm (action, "aln2overaln")|| strm (action,"overaln_param"))
10252 //mode (lower|number|uanlign) Penalty (0-100) Thresold (0-9)
10256 char clean_mode[100];
10259 F=vcalloc (1, sizeof (OveralnP));
10262 D1->A=mark_exon_boundaries (D1->A, D2->A);
10265 else if ( get_string_variable ("exon_boundaries"))
10269 EB=seq2aln(S=main_read_seq(s),NULL, 0);
10270 D1->A=mark_exon_boundaries (D1->A, EB);
10271 free_sequence (S, S->nseq); free_aln (EB);
10276 if (ACTION(1)==NULL)sprintf (F->mode, "lower");
10277 else if (strstr (ACTION(1), "h"))
10279 fprintf ( stdout, "aln2unalign lower|number|unalign F P1 P2 P3 T\n");
10280 exit (EXIT_SUCCESS);
10282 else sprintf (F->mode, "%s", ACTION(1));
10284 F->t=ATOI_ACTION(2);
10285 F->f=ATOI_ACTION(3);
10286 F->p1=ATOI_ACTION(4);
10287 F->p2=ATOI_ACTION(5);
10288 F->p3=ATOI_ACTION(6);
10289 F->p3=ATOI_ACTION(7);
10291 if (int_variable_isset ("overaln_target"))f=get_int_variable ("overaln_target");
10292 if (int_variable_isset ("overaln_threshold"))t=get_int_variable ("overaln_threshold");
10293 if (eb)sprintf (F->model, "fsa2");
10294 else sprintf (F->model, "fsa1");
10296 D1->A=aln2clean_pw_aln (D1->A, F);
10299 else if ( strm (action,"aln2unalign"))
10305 SA=copy_aln (D1->A, NULL);
10306 thread_seq_struc2aln (SA, SS);
10307 D1->A=unalign_aln (D1->A,SA, ATOI_ACTION(1));
10308 D1->S=aln2seq ( D1->A);
10310 else if ( strm (action, "clean_cdna"))
10314 for (a=0; a< A->nseq; a++)
10319 f=get_longest_frame (d, 3);
10320 buf=vcalloc ( strlen (d)+1, sizeof (char));
10321 sprintf (buf, "%s", d+f);
10322 sprintf (d, "%s", buf);
10326 else if ( strm (action, "clean_cdna2"))
10328 D1->A=clean_cdna_aln ( D1->A);
10329 free_sequence ( D1->S, (D1->S)->nseq);
10330 D1->S=aln2seq ( D1->A);
10332 else if ( strm (action, "aln2short_aln"))
10334 D1->A=aln2short_aln (D1->A, action_list[1], action_list[2], atoi(action_list[3]));
10335 free_sequence ( D1->S, (D1->S)->nseq);
10336 D1->S=aln2seq ( D1->A);
10338 else if ( strm ( action, "complement"))
10340 D1->A=complement_aln (D1->A);
10341 free_sequence ( D1->S, (D1->S)->nseq);
10342 D1->S=aln2seq ( D1->A);
10344 else if ( strm ( action, "translate"))
10346 D1->A=translate_dna_aln( D1->A,(n_actions==1)?0:atoi(action_list[1]));
10347 free_sequence ( D1->S, (D1->S)->nseq);
10348 D1->S=aln2seq ( D1->A);
10350 else if (strm2 ( action, "back_translate","backtranslate"))
10352 D1->A=back_translate_dna_aln( D1->A);
10353 free_sequence ( D1->S, (D1->S)->nseq);
10354 D1->S=aln2seq ( D1->A);
10356 else if (strm ( action, "rotate"))
10358 D1->A=rotate_aln( D1->A, action_list[1]);
10359 free_sequence ( D1->S, (D1->S)->nseq);
10360 D1->S=aln2seq ( D1->A);
10362 else if (strm ( action, "invert"))
10364 D1->A=invert_aln( D1->A);
10365 free_sequence ( D1->S, (D1->S)->nseq);
10366 D1->S=aln2seq ( D1->A);
10368 else if (strm ( action, "code_dna_aln"))
10370 D1->A=code_dna_aln( D1->A);
10371 free_sequence ( D1->S, (D1->S)->nseq);
10372 D1->S=aln2seq ( D1->A);
10375 else if ( strm ( action, "mutate"))
10377 D1->A=mutate_aln( D1->A,(n_actions==1)?"0":action_list[1]);
10378 free_sequence ( D1->S, (D1->S)->nseq);
10379 D1->S=aln2seq (D1->A);
10381 else if ( strm ( action, "thread_profile_on_msa"))
10384 D1->A=thread_profile_files2aln (D1->A, action_list[1], NULL);
10385 D1->S=aln2seq(D1->A);
10387 else if ( strm ( action, "thread_dna_on_prot_aln"))
10389 D1->A=thread_dnaseq_on_prot_aln (D1->S, D2->A);
10390 free_sequence (D1->S,(D1->S)->nseq);
10391 D1->S=aln2seq (D1->A);
10393 else if ( strm ( action, "thread_struc_on_aln"))
10395 thread_seq_struc2aln ( D2->A, D1->S);
10396 D1->A=copy_aln(D2->A, D1->A);
10397 free_sequence ( D1->S, (D1->S)->nseq);
10398 D1->S=aln2seq (D1->A);
10400 else if ( strm (action, "sim_filter"))
10402 D1->A=sim_filter (D1->A, action_list[1], ACTION (2));
10403 free_sequence (D1->S,(D1->S)->nseq);
10404 D1->S=aln2seq (D1->A);
10406 else if ( strm (action, "seq2blast"))
10408 D1->A=seq2blast (D1->S);
10409 free_sequence (D1->S,(D1->S)->nseq);
10410 D1->S=aln2seq (D1->A);
10413 else if ( strm (action, "trim"))
10415 D1->A=simple_trimseq (D1->A,(D2)?D2->A:NULL, action_list[1], ACTION (2));
10417 free_sequence (D1->S,(D1->S)->nseq);
10418 D1->S=aln2seq (D1->A);
10421 else if (strm ( action, "trimTC"))
10423 value=(n_actions==1)?10:atoi(action_list[1]);
10425 D1->A=tc_trimseq(D1->A,D1->S,action_list[1]);
10426 free_sequence (D1->S,(D1->S)->nseq);
10427 D1->S=aln2seq (D1->A);
10429 else if (strm ( action, "trimTC2"))
10433 char trim_mode[100];
10434 if ( n_actions==1 || !(strm (action_list[1], "NSEQ") ||strm (action_list[1], "MINID")) )
10436 fprintf ( stderr, "\nTrimTC2 <NSEQ | MINID> <number sequences| minimum identity> (<matrix>)\n");
10437 myexit (EXIT_FAILURE);
10439 sprintf (trim_mode, "%s", action_list[1]);action_list+=2; n_actions-=2;
10441 if ( strm ( trim_mode, "NSEQ"))
10443 group_file=tree2Ngroup( (D1)?D1->A:NULL, (D2)?D2->T:NULL, atoi (action_list[0]), vtmpnam(NULL), (n_actions==1)?"idmat":action_list[1]);
10447 group_file=tree2Ngroup( (D1)?D1->A:NULL, (D2)?D2->T:NULL, -1*atoi (action_list[0]), vtmpnam(NULL), (n_actions==1)?"idmat":action_list[1]);
10450 B=copy_aln (D1->A, B);
10451 B=aln2sub_aln_file (B,1,&group_file);
10452 B=aln2sub_seq (B, 1, &group_file);
10453 D1->A=extract_sub_aln2 (D1->A, B->nseq, B->name);
10455 else if ( strm (action, "chain"))
10457 D1->A=seq2seq_chain (D1->A,D2->A, ACTION(2));
10461 else if (strm ( action, "master_trim"))
10463 value=(n_actions==1)?10:atoi(action_list[1]);
10465 D1->A=master_trimseq(D1->A,D1->S,action_list[1]);
10466 free_sequence (D1->S,(D1->S)->nseq);
10467 D1->S=aln2seq (D1->A);
10469 else if ( strm (action, "force_aln"))
10471 char ***rlist=NULL;
10476 if (!is_lib_02(action_list[1]))
10478 fprintf ( stderr, "\nERROR: force_aln requires files in TC_LIB_FORMAT_02 [FATAL:%s]", PROGRAM);
10479 myexit (EXIT_FAILURE);
10482 rlist=file2list (action_list[1], " ");
10486 rlist=declare_arrayN(3, sizeof (char),3,7, 10);
10488 strcat (rlist[1][1],action_list[1]);strcat (rlist[1][3],action_list[2]);
10489 strcat (rlist[1][4],action_list[3]);strcat (rlist[1][6],action_list[4]);
10490 sprintf ( rlist[2][0], "-1");
10493 while (rlist[count] && atoi(rlist[count][0])!=-1)
10495 char st1[100], st2[100], st3[100], st4[100];
10497 sprintf ( st1, "%s", rlist[count][1]);sprintf ( st2, "%s", rlist[count][3]);
10498 sprintf ( st3, "%s", rlist[count][4]);sprintf ( st4, "%s", rlist[count][6]);
10499 fprintf ( stderr, "\nFORCE: %s %s %s %s", st1, st2, st3, st4);
10501 if (is_number (st1))s1=atoi (st1)-1;
10502 else s1=name_is_in_list (st1,(D1->A)->name, (D1->A)->nseq, 100);
10503 if ( s1<0 || s1>= (D1->A)->nseq)crash ("wrong sequence index");
10506 if (is_number (st3))s2=atoi (st3)-1;
10507 else s2=name_is_in_list (st3,(D1->A)->name, (D1->A)->nseq, 100);
10508 if ( s2<0 || s2>= (D1->A)->nseq)crash ("wrong sequence index");
10511 (D1->A)=add_constraint2aln ((D1->A), s1, r1, s2, r2);
10514 fprintf ( stderr, "\n");
10515 free_arrayN((void*)rlist,3);
10518 else if (strm ( action, "grep"))
10520 D1->A=grep_seq (D1->A, ACTION(1),ACTION(2), ACTION(3));
10521 if (D1->A==NULL) myexit (EXIT_SUCCESS);
10522 else D1->S=aln2seq (D1->A);
10525 else if (strm (action, "find"))
10528 char *search_string;
10530 search_string=vcalloc ( 30, sizeof (char));
10531 if ( strm (action_list[1], "lower"))sprintf ( search_string, "abcdefghijklmnopqrstuvwxyz");
10532 else if ( strm ( action_list[1], "upper"))sprintf ( search_string, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
10535 vfree (search_string);search_string=vcalloc ( strlen (action_list[1])+1, sizeof (char));
10536 sprintf (search_string, "%s", action_list[1]);
10539 for (a=0; a<(D1->A)->nseq; a++)
10540 for ( l=0,b=0; b< (D1->A)->len_aln; b++)
10542 r=(D1->A)->seq_al[a][b];
10544 if ( r!='\0' && strrchr (search_string, r))
10546 /*fprintf ( stdout, "%-15s res %c alnpos %4d seqpos %4d\n", (D1->A)->name[a], r, b+1, l);*/
10547 fprintf ( stdout, "%s %d %d\n", (D1->A)->name[a], l, l+1);
10550 myexit (EXIT_SUCCESS);
10552 else if ( strm (action, "merge_annotation"))
10554 D1->A=merge_annotation (D1->A, DST?DST->A:NULL, ACTION(1));
10555 D1->S=aln2seq (D1->A);
10557 else if ( strm (action, "color_residue"))
10563 DST->A=copy_aln (D1->A, NULL);
10564 DST->S=aln2seq (DST->A);
10565 for (a=0; a< (DST->S)->nseq; a++)ungap ((DST->S)->seq[a]);
10569 for (a=1; a<n_actions; a+=3)
10571 i=name_is_in_list(action_list[a], (D1->A)->name, (D1->A)->nseq, 100);
10574 (DST->S)->seq[i][atoi(action_list[a+1])-1]='0'+atoi(action_list[a+2])-1;
10576 else fprintf (stderr, "\nWARNING: Could not find Sequence %s", action_list[a]);
10585 fp=vfopen (action_list[1], "r");
10586 while (fscanf (fp, "%s %d %d\n", name, &pos, &val)==3)
10589 i=name_is_in_list(name, (D1->A)->name, (D1->A)->nseq, 100);
10590 if (i!=-1)(DST->S)->seq[i][pos-1]='0'+val;
10591 else fprintf (stderr, "\nWARNING: Could not find Sequence %s", action_list[a]);
10595 DST->A=seq2aln (DST->S, NULL, 1);
10597 else if ( strm (action, "edit_residue"))
10602 char mod[100], name[100];
10610 for (a=1; a<n_actions; a+=3)
10613 i=name_is_in_list(action_list[a], (D1->A)->name, (D1->A)->nseq, 100);
10616 pos=atoi(action_list[a+1]);
10619 sprintf (mod, "%s", action_list[a+2]);
10620 if ( strm (mod, "upper"))(D1->A)->seq_al[i][pos]=toupper((D1->A)->seq_al[i][pos]);
10621 else if ( strm (mod, "lower"))(D1->A)->seq_al[i][pos]=tolower((D1->A)->seq_al[i][pos]);
10622 else (D1->A)->seq_al[i][pos]=mod[0];
10624 else fprintf (stderr, "\nWARNING: Could not find Sequence %s", action_list[a]);
10630 fp=vfopen (action_list[1], "r");
10631 while (fscanf (fp, "%s %d %s\n", name, &pos, mod)==3)
10634 i=name_is_in_list(name, (D1->A)->name, (D1->A)->nseq, 100);
10638 if ( strm (mod, "upper"))(D1->A)->seq_al[i][pos]=toupper(A->seq_al[i][pos]);
10639 else if ( strm (mod, "lower"))A->seq_al[i][pos]=tolower(A->seq_al[i][pos]);
10640 else A->seq_al[i][pos]=mod[0];
10642 else fprintf(stderr, "\nWARNING: Could not find Sequence %s", action_list[a]);
10646 D1->S=aln2seq (D1->A);
10648 else if ( strm (action, "clean_flag"))
10650 clean_flag=1-clean_flag;
10652 else if ( strm (action, "aln2case"))
10654 D1->A=aln2case_aln (D1->A, ACTION(1), ACTION(2));
10655 D1->S=aln2seq(D1->A);
10658 else if ( strm5 (action, "convert","upper","lower", "keep", "switchcase"))
10662 if ( n_actions>1 && is_number (action_list[b]))
10664 lower_value=upper_value=atoi(action_list[b++]);
10666 else if ( n_actions>1 && strm (action_list[b], "gap"))
10668 DST=vcalloc (1,sizeof(Sequence_data_struc));
10669 DST->A=aln2gap_cache (D1->A,0);
10674 else if (n_actions>1 && action_list[b] && action_list[b][0]=='[')
10677 lower_value=atoi(strtok (action_list[b]+1, "-[]"));
10678 upper_value=atoi(strtok (NULL, "-[]"));
10684 lower_value=upper_value=-1;
10687 if ( n_actions >b ||strm (action, "keep") )
10689 if ( !RAD->symbol_list)RAD->symbol_list=declare_char (STRING, STRING);
10691 if ( strm (action, "keep") )sprintf ( RAD->symbol_list[RAD->n_symbol++], "#-");
10694 for (a=b; a< n_actions; a++)
10696 sprintf ( RAD->symbol_list[RAD->n_symbol], "%s", action_list[a]);
10702 for ( value=0; value<=9; value++)
10704 if ( lower_value==-1)value=-1;
10706 if ( (value>=lower_value && value<=upper_value)|| value==-1)
10708 if (strm(action,"convert")) D1->A=filter_aln_convert (D1->A, DST?DST->A:NULL,RAD->use_consensus,value,RAD->n_symbol, RAD->symbol_list);
10709 else if (strm(action,"upper"))D1->A=filter_aln_lower_upper (D1->A, DST?DST->A:NULL,RAD->use_consensus,value);
10710 else if (strm(action,"lower"))D1->A=filter_aln_upper_lower (D1->A, DST?DST->A:NULL,RAD->use_consensus,value);
10711 else if (strm(action,"switchcase"))D1->A=filter_aln_switchcase (D1->A, DST?DST->A:NULL,RAD->use_consensus,value);
10715 if (strm(action,"keep")) D1->A=filter_aln_convert (D1->A, DST?DST->A:NULL,RAD->use_consensus,value,RAD->n_symbol, RAD->symbol_list);
10717 if (value==-1)break;
10721 /*free_sequence (D1->S,(D1->S)->nseq);*/
10722 if (!D1->S)D1->S=aln2seq (D1->A);
10724 else if ( strm ( action, "count_pairs"))
10726 int a, b,c,v, **matrix;
10728 matrix=declare_int (300,300);
10730 for ( a=0; a< A->nseq-1; a++)
10731 for (b=0; b< A->nseq; b++)
10732 for (c=0; c<A->len_aln; c++)
10733 matrix[(int)A->seq_al[a][c]][(int)A->seq_al[b][c]]++;
10734 for ( a=0; a<255; a++)
10735 for ( b=a; b<256; b++)
10737 v=matrix[a][b]+matrix[b][a];
10738 if (v)fprintf ( stdout, "\n%c %c %d", a, b, v);
10740 exit (EXIT_SUCCESS);
10742 else if ( strm (action, "count_misc"))
10744 count_misc (D1->A, (!D2)?NULL:D2->A);
10746 else if ( strm (action, "count"))
10749 if ( n_actions>1 && is_number (action_list[b]))
10751 lower_value=upper_value=atoi(action_list[b++]);
10753 else if (n_actions>1 && action_list[b] && action_list[b] && action_list[b][0]=='[')
10756 lower_value=atoi(strtok (action_list[b]+1, "-[]"));
10757 upper_value=atoi(strtok (NULL, "-[]"));
10763 lower_value=upper_value=-1;
10767 if ( !RAD->symbol_list)RAD->symbol_list=declare_char (STRING, STRING);
10769 for (a=b; a< n_actions; a++)
10771 sprintf ( RAD->symbol_list[RAD->n_symbol], "%s", action_list[a]);
10775 for ( value=lower_value; value<=upper_value; value++)
10777 count_table=count_in_aln (D1->A, DST?DST->A:NULL,value,RAD->n_symbol, RAD->symbol_list, count_table);
10779 for ( a=0; a<RAD->n_symbol; a++)
10781 fprintf ( stdout, "%s %d\n", RAD->symbol_list[a], count_table[a]);
10783 free_sequence (D1->S,(D1->S)->nseq);
10784 D1->S=aln2seq (D1->A);
10785 vfree(count_table);
10786 exit(EXIT_SUCCESS);
10788 else if ( strm (action, "msa_weight"))
10791 char command [LONG_STRING];
10792 char aln_name[FILENAMELEN];
10793 char tree_name[FILENAMELEN];
10794 char dist_matrix_name[FILENAMELEN];
10795 char weight_name[FILENAMELEN];
10796 char method_4_msa_weights[1000];
10800 fprintf ( stderr, "\nError: msa_weight requires a weight_method");
10803 sprintf ( method_4_msa_weights, "%s", (get_env_variable ("METHOD_4_MSA_WEIGHTS",NO_REPORT))?get_env_variable ("METHOD_4_MSA_WEIGHTS",NO_REPORT):METHOD_4_MSA_WEIGHTS);
10805 /*1 Computation of the tree and the distance matrix*/
10806 random_value=addrand ((unsigned long) 100000)+1;
10807 sprintf (aln_name, "%d.aln", random_value);
10808 sprintf (tree_name, "%d.ph", random_value);
10809 sprintf (dist_matrix_name, "%d.dst", random_value);
10810 sprintf (weight_name, "%d.weight", random_value);
10811 output_fasta_aln (aln_name, D1->A);
10813 sprintf ( command, "clustalw -infile=%s -tree -outputtree=dist %s", aln_name, TO_NULL_DEVICE);
10814 my_system ( command);
10815 sprintf ( command, "%s -method %s -aln %s -tree %s -dmatrix %s -weightfile %s %s",method_4_msa_weights, action_list[1],aln_name, tree_name, dist_matrix_name,weight_name, TO_NULL_DEVICE);
10816 my_system ( command);
10818 (D1->A)->S=aln2seq (D1->A);
10819 ((D1->A)->S)->W=read_seq_weight ( (D1->A)->name, (D1->A)->nseq,weight_name);
10820 vremove (weight_name);
10821 vremove (aln_name);
10822 vremove (tree_name);
10823 vremove (dist_matrix_name);
10825 else if ( strm (action, "pavie_seq2random_seq"))
10827 D1->S=pavie_seq2random_seq (D1->S, action_list[1]);
10828 D1->A=seq2aln (D1->S,NULL,1);
10830 else if ( strm ( action, "pavie_seq2noisy_seq"))
10832 /*<amount of noise: 0-100> (<alp>)*/
10834 D1->S=pavie_seq2noisy_seq (D1->S, atoi(action_list[1]),ACTION(2));
10835 D1->A=seq2aln (D1->S,NULL,1);
10837 else if ( strm (action, "pavie_seq2pavie_mat"))
10840 pavie_seq2trained_pavie_mat ( D1->S, (n_actions==2)?action_list[1]:NULL);
10841 myexit (EXIT_SUCCESS);
10843 else if ( strm (action, "pavie_seq2pavie_aln"))
10846 pavie_seq2pavie_aln ( D1->S, action_list[1], ACTION(2));
10847 myexit (EXIT_SUCCESS);
10849 else if ( strm (action, "pavie_seq2pavie_dm"))
10851 if (strstr (ACTION2(2,""), "_MSA_"))
10852 D1->S=aln2seq_main(D1->A, KEEP_GAP);
10855 pavie_seq2pavie_aln ( D1->S, action_list[1],(n_actions==3)?action_list[2]:"_MATDIST_");
10856 myexit (EXIT_SUCCESS);
10858 else if ( strm (action, "pavie_seq2pavie_msa"))
10860 D1->A=pavie_seq2pavie_msa ( D1->S, action_list[1], (n_actions==3)?action_list[2]:NULL);
10862 else if ( strm (action, "pavie_seq2pavie_tree"))
10864 D1->T=pavie_seq2pavie_tree ( D1->S, action_list[1], (n_actions==3)?action_list[2]:NULL);
10866 else if ( strm (action, "pavie_seq2pavie_sort"))
10868 D1->A=pavie_seq2pavie_sort ( D1->S, action_list[1], (n_actions==3)?action_list[2]:NULL);
10871 else if ( strm (action, "aln2mat_diaa"))
10873 aln2mat_diaa (D1->S);
10875 else if ( strm (action, "aln2mat"))
10880 else if ( strm (action, "seq2latmat"))
10882 seq2latmat ( D1->S, "stdout");
10883 myexit (EXIT_SUCCESS);
10885 else if ( strm (action , "rm_target_pdb"))
10890 for (i=0; i< (D1->A)->nseq; i++)
10892 j=1;buf=(D1->A)->name[i];
10893 while (buf[j]!='_' && buf[j-1]!='_' && buf[j]!='\0')j++;
10897 else if ( strm ( action, "mat2cmp"))
10900 r=mat2cmp (D1->M, D2->M);
10901 fprintf ( stdout, "\nMATRIX COMPARISON: R=%.3f R2=%.3f On %d pairs of values\n", (float)r[0], (float)r[1], (int)r[2]);
10902 myexit (EXIT_SUCCESS);
10905 else if ( strm ( action, "overaln_list"))
10907 float *re, tre=0,sn, tsn=0, sp, tsp=0;
10908 int p1,p2,p3, t, f;
10914 HERE ("F P1 P2 P3 T");
10924 LA=vcalloc ((D1->A)->nseq, sizeof (Alignment*));
10925 LB=vcalloc ((D2->A)->nseq, sizeof (Alignment*));
10926 for (a=0; a<(D1->A)->nseq; a++)
10928 LA[a]=main_read_aln ((D1->A)->name[a], NULL);
10929 LB[a]=main_read_aln ((D2->A)->name[a], NULL);
10932 for ( a=0; a<(D1->A)->nseq; a++)
10937 re=analyze_overaln (A, B, "_case_l_",t,f,p1,p2,p3);
10938 fprintf (stdout, "\n%d: sn: %.2f sp: %.2f re: %.2f F: %d P: %d P2: %d T: %d",a, re[0],re[1],re[2],f, p1,p2,t);
10944 fprintf (stdout, "\nTOT: sn: %.2f sp: %.2f re: %.2f F: %d P: %d P2: %d T: %d", tsn/(D1->A)->nseq,tsp/(D1->A)->nseq, tre/(D1->A)->nseq,f,p1,p2,t);
10948 else if ( strm ( action, "overaln_list_scan"))
10950 float *re, tre=0, tsn=0, tsp;
10951 int p1,p2, p3, t, f;
10957 if ( ACTION(1))sprintf ( fname, "%s", ACTION(1));
10958 else sprintf ( fname, "scan_results.txt");
10960 fprintf ( stdout, "SCAN Results will be ouput in %s\n", fname);
10963 LA=vcalloc ((D1->A)->nseq, sizeof (Alignment*));
10964 LB=vcalloc ((D2->A)->nseq, sizeof (Alignment*));
10965 for (a=0; a<(D1->A)->nseq; a++)
10967 LA[a]=main_read_aln ((D1->A)->name[a], NULL);
10968 LB[a]=main_read_aln ((D2->A)->name[a], NULL);
10970 for (f=32; f<=40; f++)
10972 for (p1=90; p1<=100; p1+=5)
10974 for ( t=1; t<=3; t++)
10976 for (p2=0; p2<=40; p2+=5)
10978 for (p3=0;p3<=0;p3+=5)
10981 for ( a=0; a<(D1->A)->nseq; a++)
10986 re=analyze_overaln (A, B, "_case_l_",t,f,p1,p2,p3);
10993 fp=vfopen (fname, "a");
10994 fprintf (fp, "\nTOT: sn: %.2f sp: %.2f re: %.2f P: %d P2: %d P3: %d T: %d F: %d", tsn/(D1->A)->nseq,tsp/(D1->A)->nseq, tre/(D1->A)->nseq, p1,p2, p3,t,f);
10995 fprintf (stderr, "\nTOT: sn: %.2f sp: %.2f re: %.2f P: %d P2: %d P3: %d T: %d F: %d", tsn/(D1->A)->nseq,tsp/(D1->A)->nseq, tre/(D1->A)->nseq, p1,p2, p3,t,f);
11004 else if ( strm ( action, "overaln"))//Evaluate the capacity to predict over-aligned regions
11007 F=vcalloc (1, sizeof (OveralnP));
11010 //ATOI(1): P (0-100)
11014 DST=vcalloc (1,sizeof(Sequence_data_struc));
11015 DST->A=aln2gap_cache (D1->A,0);
11018 D1->A=filter_aln_upper_lower (D1->A, DST->A, 0, 0);
11020 sprintf (F->mode, "%s", ((s=get_string_variable ("overaln_mode")))?s:"lower");
11021 if (!strm (F->mode, "lower") && !strm (F->mode, "unalign"))printf_exit (EXIT_FAILURE,stderr,"\nERROR: unknown overal_mode in overal output [%s] [FATAL:%s]", F->mode, PROGRAM);
11023 if (int_variable_isset ("overaln_threshold"))F->t=get_int_variable ("overaln_threshold");
11024 if (int_variable_isset ("overaln_target"))F->f=get_int_variable ("overaln_target");
11025 if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P1");
11026 if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P2");
11027 if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P3");
11028 if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P4");//F P1 P2 P3 T;
11030 D2->A=aln2clean_pw_aln (D2->A, F);
11031 r=aln2pred (D1->A, D2->A,"case_l_");
11032 fprintf ( stdout, "sn %.2f sp %.2f re %.2f\n", r[0], r[1], r[2]);
11038 else if ( strm ( action, "aln2hitMat"))
11040 aln2hitMat(D1->A, ACTION(1));
11041 myexit (EXIT_SUCCESS);
11047 fprintf ( stderr, "\nWARNING: ACTION %s UNKNOWN and IGNORED\n", action);
/* aln2mat_diaa: builds and prints a di-residue ("diaa") log-odds table. */
/* Every S->name[a] is handed to main_read_aln, i.e. it is used as the name of an alignment to read. */
/* Output goes to stdout; the function ends by calling exit(EXIT_SUCCESS) and never returns to its caller. */
/* NOTE(review): several declarations and statements (count, naa, updates of c[][]) are not visible in this excerpt. */
11052 void aln2mat_diaa (Sequence *S)
11054 int a, aa1, aa2, aa3, aa4;
/* Delta: threshold below which expected/observed frequencies are treated as zero (see the tests on v below). */
11062 double Delta=0.00001;
11065 double observed, expected, f_diaa1, f_diaa2, v;
/* alp: 256-entry lookup table holding 1 for 'a'..'z' and 0 elsewhere (vcalloc zero-fill assumed -- TODO confirm). */
11068 alp=vcalloc (256, sizeof (int));
11069 for (a=0; a<26; a++)alp[a+'a']=1;
/* m: 26x26x26x26 counters indexed by (row1 col p, row1 col p+1, row2 col p, row2 col p+1); c: 26x26 counters. */
11077 m=declare_arrayN (4,sizeof (int),26,26,26,26);
11078 c=declare_arrayN (2,sizeof (int),26,26);
11080 for ( a=0; a< S->nseq; a++)
/* Progress trace on stderr: the name of the alignment about to be read. */
11082 fprintf ( stderr, "%s\n", S->name[a]);
11083 A=main_read_aln (S->name[a],NULL);
/* Lower-case every row so that residues map onto the 'a'..'z' slots of alp. */
11084 for (s1=0; s1<A->nseq; s1++)lower_string (A->seq_al[s1]);
/* All unordered pairs of rows (s1<s2), all pairs of adjacent columns (p, p+1). */
11086 for ( s1=0; s1<A->nseq-1; s1++)
11087 for (s2=s1+1; s2<A->nseq; s2++)
11089 for (p=0; p<A->len_aln-1; p++)
/* u accumulates how many of the four characters are letters according to alp. */
/* NOTE(review): the raw character value is used as the index into alp; confirm seq_al never holds negative char values. */
11092 u =alp[aa1=A->seq_al[s1][p]];
11093 u+=alp[aa2=A->seq_al[s1][p+1]];
11094 u+=alp[aa3=A->seq_al[s2][p]];
11095 u+=alp[aa4=A->seq_al[s2][p+1]];
/* Convert the characters to 0..25 indices; the guard on u that presumably protects this is not visible here. */
11099 aa1-='a';aa2-='a';aa3-='a'; aa4-='a';
11103 m[aa1][aa2][aa3][aa4]++;
/* Report: one header line, then one line per (aa1,aa2,aa3,aa4) combination below naa. */
11110 fprintf ( stdout, "# DIAA_MATRIX_FORMAT_01\n");
11112 for (aa1=0; aa1<naa; aa1++)
11113 for (aa2=0; aa2<naa; aa2++)
11114 for (aa3=0; aa3<naa; aa3++)
11115 for (aa4=0; aa4<naa;aa4++)
/* tot merges both orientations of the pair, so the printed count is symmetric. */
11124 tot=m[aa1][aa2][aa3][aa4]+m[aa3][aa4][aa1][aa2];
/* observed = tot / (count/2); f_diaa1 and f_diaa2 are the frequencies of each di-residue taken from c. */
11125 observed=((double)tot)/(double)((double)count/(double)2);
11126 f_diaa1=(double)c[aa1][aa2]/(double)count;
11127 f_diaa2=(double)c[aa3][aa4]/(double)count;
11129 expected=f_diaa1*f_diaa2;
/* Score: 0 when expected is below Delta, -100 when observed is below Delta, otherwise 10*log(observed/expected). */
11130 if (expected<Delta)v=0;
11131 else if (observed<Delta)v=-100;
11134 v=log(observed/expected)*10;
11136 // if (tot>0)fprintf ( stdout, "TEST C=%d expected=%.4f observed=%.4f v=%.4f [%d %d %d][%d] tot=%d\n", count, (float)expected, (float)observed, (float) v, c[aa1][aa2], c[aa3][aa4], count, m[aa1][aa2][aa3][aa4], tot);
/* Output line: the two di-residues as letters, the score truncated to int, and the symmetric count. */
11137 fprintf ( stdout, "%c%c %c%c %d %d\n", aa1+'a', aa2+'a', aa3+'a', aa4+'a', (int)v, m[aa1][aa2][aa3][aa4]+ m[aa3][aa4][aa1][aa2]);
/* Terminates the whole program once the table has been printed. */
11140 exit (EXIT_SUCCESS);
/* aln2mat: single-residue counterpart of aln2mat_diaa. */
/* Reads each alignment named in S->name[], counts aligned residue pairs and prints a 10*log(observed/expected) table on stdout. */
/* The function ends by calling exit(EXIT_SUCCESS) and never returns to its caller. */
/* NOTE(review): the statements that update m[][], c[] and count are not visible in this excerpt. */
11142 void aln2mat (Sequence *S)
/* Delta: threshold below which expected/observed frequencies are treated as zero. */
11152 double Delta=0.00001;
11155 double observed, expected, f_diaa1, f_diaa2, v;
/* alp: 256-entry lookup table holding 1 for 'a'..'z'. */
11158 alp=vcalloc (256, sizeof (int));
11159 for (a=0; a<26; a++)alp[a+'a']=1;
/* m: 26x26 pair counters; c: 26 per-residue counters. */
11167 m=declare_arrayN (2,sizeof (int),26,26);
11168 c=declare_arrayN (1,sizeof (int),26);
11170 for ( a=0; a< S->nseq; a++)
/* Progress trace on stderr, then read the alignment whose name is S->name[a]. */
11172 fprintf ( stderr, "%s\n", S->name[a]);
11173 A=main_read_aln (S->name[a],NULL);
11174 for (s1=0; s1<A->nseq; s1++)lower_string (A->seq_al[s1]);
/* All unordered pairs of rows (s1<s2). */
11176 for ( s1=0; s1<A->nseq-1; s1++)
11177 for (s2=s1+1; s2<A->nseq; s2++)
/* NOTE(review): the loop stops at len_aln-2, as in aln2mat_diaa, although the visible lines only read column p; */
/* the last column appears to be skipped -- confirm whether this is intended. */
11179 for (p=0; p<A->len_aln-1; p++)
/* u counts how many of the two aligned characters are letters according to alp. */
11182 u =alp[aa1=A->seq_al[s1][p]];
11183 u+=alp[aa3=A->seq_al[s2][p]];
/* Report: one header line, then one line per (aa1,aa3) combination below naa. */
11198 fprintf ( stdout, "# MONOAA_MATRIX_FORMAT_01\n");
11200 for (aa1=0; aa1<naa; aa1++)
11201 for (aa3=0; aa3<naa; aa3++)
/* tot merges both orientations of the pair. */
11208 tot=m[aa1][aa3]+m[aa3][aa1];
/* observed = tot / (count/2); f_diaa1 and f_diaa2 are the single-residue frequencies taken from c. */
11209 observed=((double)tot)/(double)((double)count/(double)2);
11210 f_diaa1=(double)c[aa1]/(double)count;
11211 f_diaa2=(double)c[aa3]/(double)count;
11213 expected=f_diaa1*f_diaa2;
/* Score: 0 when expected is below Delta, -100 when observed is below Delta, otherwise 10*log(observed/expected). */
11214 if (expected<Delta)v=0;
11215 else if (observed<Delta)v=-100;
11218 v=log(observed/expected)*10;
11220 // if (tot>0)fprintf ( stdout, "TEST C=%d expected=%.4f observed=%.4f v=%.4f [%d %d %d][%d] tot=%d\n", count, (float)expected, (float)observed, (float) v, c[aa1][aa2], c[aa3][aa4], count, m[aa1][aa2][aa3][aa4], tot);
/* Output line: the two residues as letters, the score truncated to int, and the symmetric count. */
11221 fprintf ( stdout, "%c %c %d %d\n", aa1+'a', aa3+'a', (int)v, m[aa1][aa3]+ m[aa3][aa1]);
/* Terminates the whole program once the table has been printed. */
11224 exit (EXIT_SUCCESS);
/* seq2latmat: derives a transition matrix from consecutive residues (positions b-1 and b) of each sequence in S */
/* and writes it to fname in the "BLAST_MATRIX FORMAT" layout printed below. */
/* S:     input sequences (S->seq, S->len, S->nseq are read). */
/* fname: destination handed to vfopen in "w" mode. */
/* NOTE(review): the counting statements and the return statement are not visible in this excerpt; */
/* the matrix is presumably returned -- confirm. */
11228 int **seq2latmat ( Sequence *S, char *fname)
11235 double observed, expected;
11238 fp=vfopen (fname, "w");
/* count: one slot per character code; mat: 256x256 table indexed by character codes. */
11240 count=vcalloc ( 256, sizeof (int));
11241 mat=declare_int (256, 256);
/* aa is a private copy of BLAST_AA_ALPHABET; naa is its length. */
11243 naa=strlen ( BLAST_AA_ALPHABET);
11244 aa=vcalloc ( naa+2, sizeof (char));
11245 sprintf ( aa, "%s", BLAST_AA_ALPHABET);
/* Scan every sequence; r0/r1 are the lower-cased previous and current residues. */
11248 for ( tot=0,a=0; a< S->nseq; a++)
11251 for ( b=1; b<S->len[a]; b++)
11253 r0=tolower(S->seq[a][b-1]);
11254 r1=tolower(S->seq[a][b]);
/* Convert the raw counts held in mat into scores, for every pair of alphabet symbols. */
11262 for ( a=0; a< naa; a++)
11263 for (b=0; b< naa; b++)
/* NOTE(review): this if has an empty statement as its body; the branch that follows it is not visible here. */
11265 if ( aa[a]=='*' || aa[b]=='*');
/* expected = (count[a]/tot) * (count[b]/tot) * tot; observed = raw count stored in mat. */
11268 expected=((double)count[(int)aa[a]]/(double)tot)* ((double)count[(int)aa[b]]/(double)tot)*(double)tot;
11269 observed=((double)mat[(int)aa[a]][(int)aa[b]]);
/* Diagnostic traces on stderr; the last two statements print the same value twice. */
11272 fprintf ( stderr, "\n%c=%d %c=%d Tot=%d Obs=%d Exp=%d\n", aa[a],count[aa[a]], aa[b],count[aa[b]],tot, mat[aa[a]][aa[b]],(int)expected);
11273 fprintf ( stderr, "\n%d", mat[aa[a]][aa[b]]);
11274 fprintf ( stderr, "\n%d", mat[aa[a]][aa[b]]);
/* Score is 0 when either quantity is 0, else 10*log(observed/expected). */
/* The (int) cast binds to the literal 10 only; the product is truncated by the assignment to the int matrix. */
11276 mat[(int)aa[a]][(int)aa[b]]=(expected==0 || observed==0)?0:((int)10*log((observed/expected)));
/* File header, then the column header line made of the upper-cased alphabet. */
11280 fprintf (fp,"# BLAST_MATRIX FORMAT\n#ALPHABET=%s\n#TRANSITION MATRIX TRAINED ON %d Sequence\n#", BLAST_AA_ALPHABET, S->nseq);
11281 for (a=0; a< naa; a++)fprintf ( fp, "%3c ", toupper(aa[a]));
/* One row per alphabet symbol: the upper-cased symbol followed by its naa scores. */
11283 for (a=0; a< naa; a++)
11286 fprintf (fp, "%c", toupper(aa[a]));
11287 for ( b=0; b< naa; b++)
11289 fprintf (fp, "%3d ", mat[(int)aa[a]][(int)aa[b]]);
11291 fprintf ( fp, "\n");
/* mat2cmp: compares two 256x256 integer matrices and returns the result of return_r on the collected value pairs. */
/* The "mat2cmp" action of seq_reformat prints elements [0], [1] and [2] of the result as R, R2 and the number of pairs. */
/* NOTE(review): the selection criterion inside the two scans and the final return are not visible in this excerpt. */
11300 double* mat2cmp ( int **mat1, int **mat2)
/* Both matrices are mandatory: a missing one is a fatal error. */
11304 if ( !mat1 || !mat2)
11306 fprintf ( stderr, "\nERROR: mat2cmp needs two matrices [FATAL:%s]", PROGRAM);
11307 myexit (EXIT_FAILURE);
/* First pass: count (n) the cells that will be compared. */
11310 for (n=0, a=0; a< 256; a++)
11311 for ( b=0; b<256; b++)
/* Nothing to compare: a null pointer is returned (the literal 0 converts to double*); */
/* NOTE(review): the visible caller dereferences the result without a NULL check. */
11317 if ( n==0) return 0;
/* list holds n rows of 2 doubles, one row per compared cell. */
11318 list=declare_double (n, 2);
/* Second pass: fill list; n is reset and counted again. */
11320 for (n=0, a=0; a<256; a++)
11321 for ( b=0; b<256; b++)
/* Compute the statistics on the collected pairs, then release the temporary list. */
11332 r=return_r (list, n);
11333 free_double(list, -1);
/* read_blast_matrix: parses a BLAST-style substitution matrix file into a 256x256 int table. */
/* mat_name: file opened through vfopen in "r" mode. */
/* Non-gap scores are stored at [letter-'A'] indices for every upper/lower case combination; */
/* gap scores are stored using GAP_CODE and the raw character code of the partner symbol. */
/* NOTE(review): declarations of alp/buf, the computation of n_aa and the return are not visible in this excerpt. */
11337 int ** read_blast_matrix ( char *mat_name)
11344 char sbuf[VERY_LONG_STRING];
11348 matrix=declare_int (256,256);
/* Row 30 is released and replaced by a 10000-int row. */
/* NOTE(review): the reason for the larger row is not visible here -- confirm how index 30 is used. */
11349 vfree ( matrix[30]);
11350 matrix[30]=vcalloc(10000, sizeof (int));
11351 fp=vfopen ( mat_name, "r");
/* Header: while the next character is '#' or white space, read the rest of that line into sbuf. */
11352 while ( (c=fgetc(fp))=='#' || isspace(c) )
11355 fgets ( sbuf, VERY_LONG_STRING, fp);
/* A header line containing "ALPHABET" provides the symbol order of the rows and columns. */
/* NOTE(review): %s has no width limit; the size of alp is not visible here. */
11356 if ( (p=strstr (sbuf, "ALPHABET")))
11357 sscanf (p, "ALPHABET=%s", alp);
11360 lower_string (alp);
/* One text row per alphabet symbol: a leading token, then n_aa integer scores. */
11363 for ( a=0; a< n_aa; a++)
11365 fscanf ( fp, "%s ", buf);
/* aa1 is the lower-cased first character of the row label. */
11367 aa1=tolower(buf[0]);
/* Fatal parsing error; the condition that triggers it is not visible in this excerpt. */
11371 fprintf ( stderr, "\nParsing_error when reading blast_matrix %s:\n%c %c",mat_name, aa1,alp[a]);
11372 fprintf ( stderr, "\n%c ", fgetc(fp));
11373 myexit (EXIT_FAILURE);
11375 for ( b=0; b<n_aa; b++)
/* aa2 is the column symbol, value the score read for (aa1, aa2). */
11377 aa2=tolower ((char) alp[b]);
11378 fscanf ( fp, "%d ", &value);
/* Gap case: gap symbols are replaced by GAP_CODE; the non-gap partner is stored under both of its cases. */
11379 if (is_gap(aa1) || is_gap(aa2))
11382 c1=(is_gap(aa1))?GAP_CODE:aa1;
11383 c2=(is_gap(aa2))?GAP_CODE:aa2;
11384 if ( c1==GAP_CODE && c2==GAP_CODE)
11385 matrix[c1][c2]=value;
11386 else if ( c1==GAP_CODE)
11388 matrix[c1][tolower(c2)]=value;
11389 matrix[c1][toupper(c2)]=value;
11393 matrix[tolower(c1)][c2]=value;
11394 matrix[toupper(c1)][c2]=value;
/* Regular residues ('*' excluded): indices are offset by 'A' and all four case combinations get the same score. */
/* NOTE(review): the gap branch above indexes with raw character codes while this branch subtracts 'A' -- confirm readers expect both conventions. */
11397 else if ( aa1!='*' && aa2!='*')
11399 matrix[tolower(aa1)-'A'][tolower(aa2)-'A']=value;
11400 matrix[toupper(aa1)-'A'][toupper(aa2)-'A']=value;
11401 matrix[tolower(aa1)-'A'][toupper(aa2)-'A']=value;
11402 matrix[toupper(aa1)-'A'][tolower(aa2)-'A']=value;
/* output_blast_mat: writes mat to fname through output_mat, using BLAST_AA_ALPHABET as the symbol list */
/* and 'A' as the index offset; returns whatever output_mat returns. */
11412 int output_blast_mat (int **mat, char *fname)
11414 return output_mat(mat, fname, BLAST_AA_ALPHABET, 'A');
/* output_mat: writes a square score table in the "BLAST_MATRIX FORMAT" text layout. */
/* mat:    table indexed as mat[symbol-offset][symbol-offset]. */
/* fname:  destination, opened through vfopen in "w" mode. */
/* alp:    symbols to output, in row/column order. */
/* offset: value subtracted from each symbol to obtain its index in mat. */
/* Returns 0 when the file cannot be opened; the success return value is not visible in this excerpt. */
11418 int output_mat (int **mat, char *fname, char *alp, int offset)
/* aa is a private copy of alp (naa is presumably strlen(alp); its computation is not visible here). */
11427 aa=vcalloc ( naa+2, sizeof (char));
11428 sprintf ( aa, "%s",alp);
11430 if (!(fp=vfopen (fname, "w")))return 0;
/* Header lines, then the column header made of the upper-cased symbols. */
11431 fprintf (fp,"# BLAST_MATRIX FORMAT\n#ALPHABET=%s\n",alp);
11432 for (a=0; a< naa; a++)fprintf ( fp, "%3c ", toupper(aa[a]));
/* One row per symbol: the upper-cased symbol followed by naa scores. */
/* NOTE(review): column headers use "%3c " while cells use " %5d", so headers and values are not column-aligned. */
11434 for (a=0; a< naa; a++)
11437 fprintf (fp, "%c", toupper(aa[a]));
11438 for ( b=0; b< naa; b++)
11440 fprintf (fp, " %5d", mat[aa[a]-offset][aa[b]-offset]);
11442 fprintf ( fp, "\n");
/* output_pavie_mat: writes mat to fname in the "PAVIE_MATRIX FORMAT" text layout. */
/* mat:   table indexed as mat[symbol-'A'][symbol-'A']; values are divided by PAVIE_MAT_FACTOR on output. */
/* fname: destination, opened through vfopen in "w" mode. */
/* gep:   gap value; written as a "- -" line unless it equals UNDEFINED. */
/* alp:   symbols to output (n is presumably strlen(alp); its computation is not visible here). */
11449 void output_pavie_mat (int **mat, char *fname, double gep, char *alp)
11455 fp=vfopen (fname, "w");
11456 fprintf (fp,"# PAVIE_MATRIX FORMAT\n#ALPHABET=%s\n",alp);
/* Only the upper triangle (b>=a) is written: one "X Y score" line per symbol pair. */
11458 for(a=0; a< n; a++)
11460 for ( b=a; b<n; b++)
11462 fprintf (fp, "%c %c %.3f\n", toupper(alp[a]), toupper(alp[b]), (float)mat[alp[a]-'A'][alp[b]-'A']/PAVIE_MAT_FACTOR);
/* Optional gap line, scaled like the scores. */
11465 if ( gep!=UNDEFINED)fprintf ( fp, "- - %.3f\n", gep/PAVIE_MAT_FACTOR);
/* read_pavie_matrix: parses a file in the layout written by output_pavie_mat into a 256x256 int table. */
/* mat_name: file opened through vfopen in "r" mode. */
/* Scores are multiplied by PAVIE_MAT_FACTOR and stored symmetrically at [letter-'A'] indices for every case combination. */
/* NOTE(review): declarations, the computation of n_aa and the return are not visible in this excerpt. */
11469 int ** read_pavie_matrix ( char *mat_name)
11476 char sbuf[VERY_LONG_STRING];
11480 matrix=declare_int (256,256);
11483 fp=vfopen ( mat_name, "r");
/* Header: while the next character is '#' or white space, read the rest of that line into sbuf. */
/* The leading '#' has been consumed by fgetc, so a "#ALPHABET=..." line reaches sscanf as "ALPHABET=...". */
11484 while ( (c=fgetc(fp))=='#' || isspace(c) )
11486 fgets ( sbuf, VERY_LONG_STRING, fp);
/* NOTE(review): this if has an empty statement as its body; %s has no width limit and the size of alp is not visible here. */
11487 if ( sscanf (sbuf, "ALPHABET=%s", alp)==1);
/* Body: one "X Y value" triple per line; lines that do not match are ignored by the sscanf test. */
11492 while ( fgets ( sbuf, VERY_LONG_STRING, fp)!=NULL)
11495 if (sscanf (sbuf, "%c %c %f",&aa1, &aa2, &v)==3)
/* Undo the scaling applied when the file was written. */
11497 v*=PAVIE_MAT_FACTOR;
/* A "- -" line carries the gap value. */
11498 if (aa1=='-' && aa2=='-')gep=v;
/* Regular pair: store v for (aa1,aa2) and (aa2,aa1) under every case combination; the float is converted to int on assignment. */
11501 matrix[tolower(aa1)-'A'][tolower(aa2)-'A']=v;
11502 matrix[toupper(aa1)-'A'][toupper(aa2)-'A']=v;
11503 matrix[tolower(aa1)-'A'][toupper(aa2)-'A']=v;
11504 matrix[toupper(aa1)-'A'][tolower(aa2)-'A']=v;
11506 matrix[tolower(aa2)-'A'][tolower(aa1)-'A']=v;
11507 matrix[toupper(aa2)-'A'][toupper(aa1)-'A']=v;
11508 matrix[tolower(aa2)-'A'][toupper(aa1)-'A']=v;
11509 matrix[toupper(aa2)-'A'][tolower(aa1)-'A']=v;
/* When a gap value was read, use it for every alphabet symbol whose [symbol][GAP_CODE] entry is still 0. */
11513 if ( gep!=UNDEFINED)
11516 for (a=0; a< n_aa; a++)
/* Only the lower-case entry is tested; both case entries are then overwritten. */
11518 if (!matrix[tolower(alp[a])-'A'][GAP_CODE])
11520 matrix[tolower(alp[a])-'A'][GAP_CODE]=gep;
11521 matrix[toupper(alp[a])-'A'][GAP_CODE]=gep;
/* seq2year: rewrites, in place, every non-gap residue of every sequence as a letter 'a'+((y/modulo)%10), */
/* and tags each sequence name with "_agechannel<modulo>". */
/* S:      sequences to recode (S->seq, S->name and S->seq_comment are used). */
/* modulo: divisor applied to the year counter y before taking the last decimal digit. */
/* NOTE(review): the update of y inside the loop, the default for first and the return are not visible in this excerpt. */
11529 Sequence *seq2year ( Sequence *S, int modulo)
11534 char new_channel[100];
11536 sprintf( new_channel, "_agechannel%d",modulo);
11538 for ( a=0; a<S->nseq; a++)
/* The starting year is read from a "_FIRSTYEAR<number>_" marker in the sequence comment, when present. */
11540 if (S->seq_comment[a] && (s=strstr(S->seq_comment[a], "_FIRSTYEAR")))
11542 sscanf (s, "_FIRSTYEAR%d_", &first);
/* y starts at first; gaps are left untouched. */
11546 for ( y=first,b=0; b<S->len[a]; b++)
11548 if ( !is_gap(S->seq[a][b]))
11550 S->seq[a][b]='a'+((y/modulo))%10;
/* An existing "_agechannel..." suffix is overwritten from that point on; otherwise the tag is appended. */
/* NOTE(review): both paths write into S->name[a] without a visible capacity check. */
11554 if ( (s=strstr ( S->name[a], "_agechannel")))
11556 sprintf ( s, "%s", new_channel);
11558 else strcat (S->name[a], new_channel);
/* output_n_pavie_age_channel: calls output_pavie_age_channel n times with modulo 1, 10, 100, ... */
/* (x is multiplied by 10 after each call), feeding the returned Sequence into the next call. */
/* NOTE(review): the return statement is not visible in this excerpt. */
11563 Sequence* output_n_pavie_age_channel (Sequence *S, char *name, int n)
11569 for ( x=1,a=0; a< n; a++, x*=10)
11571 S=output_pavie_age_channel(S, name,x);
/* output_pavie_age_channel: produces the age-channel files for one modulo value. */
/* It writes the age matrix "<name>_age_mat_mod<modulo>.mat", appends that file name to */
/* "<name>_pavie_age_matrix.mat_list", recodes S with seq2year and appends the recoded sequences */
/* (FASTA) to "<name>_age_channel.fasta". */
/* NOTE(review): the conditions around the clean-up and display steps (presumably driven by the static */
/* display flag) and the return statement are not visible in this excerpt. */
11579 Sequence* output_pavie_age_channel (Sequence *S, char *name, int modulo)
/* display is static: it persists across calls. */
11583 static int display;
/* NOTE(review): mat_list_name is only 100 bytes and all three buffers are filled with unbounded sprintf calls built from name. */
11584 char mat_list_name[100];
11585 char seq_list[1000];
11586 char mat_name[1000];
11589 sprintf ( mat_list_name, "%s_pavie_age_matrix.mat_list", name);
11590 sprintf (seq_list, "%s_age_channel.fasta",name);
/* Remove output files left over from a previous run. */
11594 if (check_file_exists(seq_list))vremove (seq_list);
11595 if (check_file_exists(mat_list_name))vremove (mat_list_name);
/* Write the matrix for this modulo and register its name in the list file (append mode). */
11597 sprintf (mat_name, "%s_age_mat_mod%d.mat",name, modulo);
11598 output_age_matrix ( mat_name, modulo);
11600 fp=vfopen ( mat_list_name,"a");
11601 fprintf ( fp, "%s\n", mat_name);
/* Recode the sequences, dump them to a temporary FASTA file and append that file to seq_list. */
11604 S=seq2year (S,modulo);
11605 A=seq2aln (S, NULL, KEEP_GAP);
11606 output_fasta_seq (tmp=vtmpnam (NULL),A);
11607 file_cat ( tmp, seq_list);
/* Announce the two output files on stdout. */
11611 display_output_filename ( stdout, "AGE_MAT_LIST", "MAT_LIST", mat_list_name, CHECK);
11612 display_output_filename ( stdout, "AGE_SEQ", "FASTA", seq_list, CHECK);
/* Trace the modulo used on stderr. */
11615 fprintf ( stderr, "\nModulo:%d years", modulo);
11616 fprintf ( stderr, "\n");
11621 // Name Manipulation
/* clean_aln: normalises the textual fields of an alignment in place. */
/* seq_comment and aln_comment go through clean_string, names through translate_names, */
/* and the attached Sequence A->S through clean_sequence. */
/* NOTE(review): any guard on A and the return statement are not visible in this excerpt. */
11624 Alignment *clean_aln (Alignment *A)
11628 A->seq_comment=clean_string (A->nseq, A->seq_comment);
11629 A->aln_comment=clean_string (A->nseq, A->aln_comment);
11630 A->name=translate_names(A->nseq, A->name);
11631 (A->S)=clean_sequence ((A->S));
/* clean_sequence: normalises the textual fields of a Sequence in place: */
/* seq_comment goes through clean_string and names through translate_names. */
/* NOTE(review): any guard on S and the return statement are not visible in this excerpt. */
11635 Sequence *clean_sequence ( Sequence *S)
11639 S->seq_comment=clean_string (S->nseq, S->seq_comment);
11640 S->name=translate_names(S->nseq, S->name);
/* translate_names: applies translate_name to each of the n entries of name, storing the result back */
/* (translate_name may reallocate an entry). */
/* NOTE(review): the return statement is not visible in this excerpt. */
11643 char ** translate_names (int n, char **name)
11646 for ( a=0; a<n; a++)
11647 name[a]=translate_name(name[a]);
/* translate_name: sanitises one sequence name in place and decodes it through decode_name. */
/* Names starting with a single quote are returned untouched. */
/* NOTE(review): the computation of len, the declaration of buf and the final return are not visible in this excerpt. */
11650 char * translate_name ( char *name)
11659 if ( name[0]=='\'')return name;
11661 for ( a=0; a<len; a++)
/* White space terminates the name (replaced by '\0'); each of ; ( ) , : # > < becomes '_'. */
11663 if ( isspace(name[a]))name[a]='\0';
11664 else if (strchr (";(),:#><", name[a]))name[a]='_';
/* Map a coded name back to its original form (decode_name returns name itself when there is nothing to decode). */
11667 sprintf (buf,"%s",decode_name (name, DECODE));
/* Grow the storage when the decoded name is longer than the current array. */
/* NOTE(review): if read_array_size_new reports the capacity in bytes, a decoded name whose length equals */
/* that capacity does not trigger the realloc although the terminator needs one more byte -- confirm. */
11668 if ( strlen (buf)>read_array_size_new ((char *)name))
11670 name=vrealloc (name, sizeof (char)*(strlen (buf)+1));
11672 sprintf (name, "%s", buf);
/* decode_name: keeps a static table of (original name, generated code) pairs and translates between them. */
/* name: the name to code or decode. */
/* mode: CODELIST and DECODE are tested explicitly below; the coding branch header is not visible in this excerpt. */
/* Returned strings point into the static table (or are name itself); any other mode is a fatal error. */
/* NOTE(review): several declarations and branch headers are not visible in this excerpt. */
11676 char *decode_name (char *name, int mode)
/* name_list[k][0] is the original name, name_list[k][1] its code; tag is the prefix shared by all codes. */
11678 static char ***name_list;
11680 static char tag[100];
/* Release every stored pair (the condition that triggers this reset is not visible here). */
11685 for (a=0; a<n; a++)
11687 vfree (name_list[a][0]);
11688 vfree (name_list[a][1]);
11689 vfree (name_list[a]);
/* CODELIST: dump the whole table as "#CODE: original <=> code" lines into a temporary file. */
11696 if ( mode == CODELIST)
11699 file=vtmpnam (NULL);
11700 for (a=0; a< n; a++)
11701 printf_file(file, "a", "#CODE: %s <=> %s\n", name_list[a][0], name_list[a][1]);
/* Nothing to decode when the table is empty; a NULL name is returned as is. */
11704 if (mode ==DECODE && name_list==NULL)return name;
11705 if ( name==NULL) return name;
/* The tag embeds a pseudo-random number (rand()%100000). */
11712 sprintf ( tag, "TCTAG_%d",rand ()%100000);
/* Coding: a name that is already known gets its existing code back. */
11717 for (a=0; a< n; a++)
11718 if ( strm (name, name_list[a][0]))return name_list[a][1];
/* Otherwise append a new pair whose code is "<tag>_<rank>", rank starting at 1. */
/* NOTE(review): the table is grown with plain realloc, without a check of the result, while its elements use vcalloc/vfree. */
11721 name_list=realloc (name_list, sizeof (char**)*(n+1));
11722 name_list[n]=vcalloc (2, sizeof (char*));
11723 name_list[n][0]=vcalloc (strlen (name)+1, sizeof (char));
11724 name_list[n][1]=vcalloc (100, sizeof (char));
11725 sprintf ( name_list[n][0], "%s", name);
11726 sprintf ( name_list[n][1], "%s_%d", tag,n+1);
11727 return name_list[n++][1];
11729 else if ( mode ==DECODE)
/* Decoding: names that do not contain the tag are returned unchanged. */
11733 if ( !(p=after_strstr (name, tag)))return name;
/* The rank following the tag selects the original name. */
/* NOTE(review): no range check on i is visible before the table is indexed. */
11736 sscanf (p, "_%d", &i);
11737 return name_list[i-1][0];
/* Any other mode is a fatal error. */
11742 printf_exit (EXIT_FAILURE, stderr,"Unknown Mode for Decode_name [FATAL:%s]", PROGRAM);
/* display_sequences_names: prints a summary of the input sequences on fp. */
/* S:                sequences to list (file, name, seq and type are read). */
/* fp:               output stream; display_sequence_templates may replace it. */
/* check_pdb_status: when set, a "Struct Yes/No" field is added, otherwise "Struct Unchecked". */
/* print_templates:  when set, display_sequence_templates is called for each sequence. */
/* NOTE(review): the condition guarding the fatal error, the choice between the two PDB tests and the */
/* return statement are not visible in this excerpt. */
11748 FILE * display_sequences_names (Sequence *S, FILE *fp, int check_pdb_status, int print_templates)
/* Fatal error path: no sequence could be read. */
11756 fprintf (fp,"\nERROR: NO SEQUENCE READ [FATAL:%s]\n", PROGRAM); myexit (EXIT_FAILURE);
/* max_len is the longest sequence name; it is used below as the field width for both the file and the name columns. */
11758 for ( a=0, max_len=0; a< S->nseq; a++)max_len=MAX(max_len, strlen (S->name[a]));
11759 fprintf ( fp, "\nINPUT SEQUENCES: %d SEQUENCES [%s]", S->nseq,(S->type)?S->type:"Unknown type");
11760 for ( a=0; a< S->nseq; a++)
/* NOTE(review): S->type is passed to %s here without the NULL guard used in the header line above. */
11762 fprintf (fp, "\n Input File %-*s Seq %-*s Length %4d type %s",max_len,S->file[a], max_len,S->name[a],(int)strlen ( S->seq[a]), S->type);
11763 if (check_pdb_status)
/* First variant: resolve the structure through seq_is_pdb_struc and print its PDB id. */
11765 if ((r=seq_is_pdb_struc (S, a)))fprintf (fp, " Struct Yes PDBID %s", get_pdb_id(r));
11766 else fprintf (fp, " Struct No");
/* Second variant: test the sequence name and the file name with is_pdb_struc. */
11768 if (is_pdb_struc (S->name[a])||is_pdb_struc (S->file[a]) )fprintf (fp, " Struct Yes");
11769 else fprintf (fp, " Struct No");
11772 else fprintf (fp, " Struct Unchecked");
11773 if ( print_templates)fp=display_sequence_templates (S, a, fp);
11777 fprintf ( fp, "\n");
/* add_file2file_list: stores a file name in the name array of a Sequence used as a file list. */
/* name: file name to store. */
/* S:    existing list, or NULL to have a new one declared. */
/* NOTE(review): the return statement is not visible in this excerpt. */
11781 Sequence *add_file2file_list (char *name, Sequence *S)
11784 if (!S) S=declare_sequence (1,1,10);
/* NOTE(review): "S->nseq=0;" is a separate statement, not part of the else branch, so it runs on every call: */
/* the name is always written into slot 0 and nseq is always 1 afterwards -- confirm this is intended. */
11785 else S=realloc_sequence (S,S->nseq+1,0);S->nseq=0;
11787 sprintf ( S->name[S->nseq++], "%s", name);
11791 /*********************************COPYRIGHT NOTICE**********************************/
11792 /*© Centro de Regulacio Genomica */
11794 /*Cedric Notredame */
11795 /*Tue Oct 27 10:12:26 WEST 2009. */
11796 /*All rights reserved.*/
11797 /*This file is part of T-COFFEE.*/
11799 /* T-COFFEE is free software; you can redistribute it and/or modify*/
11800 /* it under the terms of the GNU General Public License as published by*/
11801 /* the Free Software Foundation; either version 2 of the License, or*/
11802 /* (at your option) any later version.*/
11804 /* T-COFFEE is distributed in the hope that it will be useful,*/
11805 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
11806 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
11807 /* GNU General Public License for more details.*/
11809 /* You should have received a copy of the GNU General Public License*/
11810 /* along with T-COFFEE; if not, write to the Free Software*/
11811 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
11812 /*............................................... |*/
11813 /* If you need some more information*/
11814 /* cedric.notredame@europe.com*/
11815 /*............................................... |*/
11819 /*********************************COPYRIGHT NOTICE**********************************/