8 #include "io_lib_header.h"
9 #include "util_lib_header.h"
10 #include "dp_lib_header.h"
11 #include "define_header.h"
12 #include "dev1_lib_header.h" //JM_STRAT
/*
 * Positional accessors for the action argument list.
 *
 * All three macros expand to expressions that read two names from the
 * scope where they are used:
 *   n_actions   - number of valid entries in action_list
 *   action_list - array of strings, indexed from 0
 *
 * ACTION(x)      -> action_list[x] if entry x exists (n_actions >= x+1),
 *                   NULL otherwise.
 * ACTION2(x,y)   -> action_list[x] if entry x exists, y otherwise.
 * ATOI_ACTION(x) -> atoi() of entry x if it exists, 0 otherwise.
 *
 * Every parameter is parenthesized in the expansion so that expression
 * arguments (e.g. `c ? 3 : 0` or `i & 1`) are evaluated as a unit in both
 * the bounds test and the index.
 *
 * NOTE: x is still evaluated more than once, so do not pass arguments with
 * side effects (i++, function calls that mutate state, ...).
 */
#define ACTION(x) ((n_actions>=((x)+1))?action_list[(x)]:NULL)
#define ACTION2(x,y) ((n_actions>=((x)+1))?action_list[(x)]:(y))
#define ATOI_ACTION(x) ((ACTION(x)!=NULL)?(atoi(ACTION(x))):0)
18 /**************************************************************************************************/
19 /***************************** SEQ_REFORMAT ******************************************/
20 /**************************************************************************************************/
21 int output_transitions(char *outfile, Alignment *A);
22 static int output_age_matrix ( char *outfile, int val);
23 int SeqGCGCheckSum(char *seq, int len);
24 static Sequence *seq2year ( Sequence *S, int modulo);
25 static Sequence* output_n_pavie_age_channel (Sequence *S, char *name, int n);
26 static Sequence* output_pavie_age_channel (Sequence *S, char *name, int modulo);
28 static int output_seq2struc(char *outfile, Alignment *A);
29 void output_conservation_statistics ( char *file, Alignment *A);
30 /**************************************************************************************************/
31 /***************************** SEQ_REFORMAT ******************************************/
32 /**************************************************************************************************/
33 int seq_reformat ( int argc, char **in_argv)
36 Sequence_data_struc *D1=NULL;
37 Sequence_data_struc *D2=NULL;
38 Sequence_data_struc *D_ST=NULL;
39 Action_data_struc *RAD;
52 char *struc_in_format;
53 char *struc_out_format;
60 char ***rename_list=NULL;
68 RAD=vcalloc ( 1, sizeof ( Action_data_struc));
70 declare_name (cache);sprintf ( cache, "use");
71 declare_name(in_file);
72 declare_name(in2_file);
73 declare_name(out_file);
74 declare_name(out2_file);
75 declare_name(struc_in_format);
76 declare_name(struc_out_format);
77 declare_name(RAD->coor_file);
79 declare_name(struc_in_file);
80 declare_name(struc_out_file);
81 declare_name(in_format);
82 declare_name(in2_format);
83 declare_name(out_format);
84 declare_name(rename_file);
87 argv=break_list ( in_argv, &argc, "=;, \n");
89 action_list=declare_char ( 100, 100);
91 /*END INITIALIZATION*/
93 addrandinit ( (unsigned long) 500);
95 if ( argc==1 || strm6 ( argv[1], "h", "-h", "help", "-help", "-man", "?"))
98 fprintf ( stdout, "\n%s (%s,%s,%s [%s])\n",PROGRAM, VERSION,AUTHOR, DATE, URL);
99 fprintf ( stdout, "\n*********** MINIMUM SYNTAX *****************");
100 fprintf ( stdout, "\nseq_reformat -in <in_file> -output <out_format>");
101 fprintf ( stdout, "\nSome File formats are automatically recognised");
102 fprintf ( stdout, "\nSee Format section");
103 fprintf ( stdout, "\n");
104 fprintf ( stdout, "\n*********** MAIN FLAGS ******************");
105 fprintf ( stdout, "\n-in name........Name of the file read");
108 fprintf ( stdout, "\n-input format......Name of the format read, see Input Format Section");
109 fprintf ( stdout, "\n...................Automatic detection, except for seqs of numbers");
110 fprintf ( stdout, "\n...................-input number_aln | number_fasta");
111 fprintf ( stdout, "\n-in2 fname......Second alignment");
112 fprintf ( stdout, "\n-input2 format.....See -input");
113 fprintf ( stdout, "\n-exon_boundaries obj file");
114 fprintf ( stdout, "\n-out fname......Output file (defualt is STDOUT");
115 fprintf ( stdout, "\n-output format.....Output Format, default is fasta_aln");
116 fprintf ( stdout, "\n-struc_in name...File containing a coded aln");
117 fprintf ( stdout, "\n-struc_in_f format.See -input and output format section");
118 fprintf ( stdout, "\n-struc_out fname..Name of the output structure");
119 fprintf ( stdout, "\n-struc_out_f symbol");
120 fprintf ( stdout, "\n-keep_case=on|off..keep case, On by default");
121 fprintf ( stdout, "\n-action +ac1 +ac2..See the action section");
122 fprintf ( stdout, "\n-rename <file>.....Rename the sequences following <file> indications");
123 fprintf ( stdout, "\n...................File Format: One couple <name1><space><name2>/line");
124 fprintf ( stdout, "\n...................Rename order <name1> into <name2>");
125 fprintf ( stdout, "\n...................code file: -output code_name");
126 fprintf ( stdout, "\n-code <file> Rename file <name1> to <name2>");
127 fprintf ( stdout, "\n-decode <file> Rename file <name2> to <name1>");
128 fprintf ( stdout, "\n-no_warning........Suppresses all warnings");
129 fprintf ( stdout, "\n-cache.............use,ignore,update,local, DirectoryName");
132 fprintf ( stdout, "\n");
134 fprintf ( stdout, "\n*********** REFORMAT ACTIONS *****************");
135 fprintf ( stdout, "\n +Xaction.............Specifies which file undergoes the action");
136 fprintf ( stdout, "\n +Xaction.............X=1: -in");
137 fprintf ( stdout, "\n +Xaction.............X=2: -in2");
138 fprintf ( stdout, "\n +Xaction.............X=3: -struc_in");
139 fprintf ( stdout, "\n +name2unique_name....replace duplicated name with name_#");
140 fprintf ( stdout, "\n +swap_header........,swapp comments: replace comments/name in 1 by in 2");
141 fprintf ( stdout, "\n +swap_lib_header.F...Replace the sequences in the tc_lib (-in) with those in F");
142 fprintf ( stdout, "\n .....................F is a legal FASTA file");
145 fprintf ( stdout, "\n +translate[0-2]......Translate on Frame 0, 1, 2 ");
146 fprintf ( stdout, "\n +translate[3]........longuest ORF on direct strand");
147 fprintf ( stdout, "\n +translate[4]........longuest ORF on direct+complementary strand");
150 fprintf ( stdout, "\n +add_scale..<offset>.addscale below aln");
152 fprintf ( stdout, "\n +rm_gap n ...........Removes col with n%% gap [n=100]");
153 fprintf ( stdout, "\n +rmgap_col SEQ1:SEQ2.Removes column with a gap in SEQ [#] ");
155 fprintf ( stdout, "\n +backtranslate.......Random Backtranslation");
156 fprintf ( stdout, "\n +complement..........Produces the reverse complement");
158 fprintf ( stdout, "\n +reorder.............Reorders sequences of <in> according to <in2>");
159 fprintf ( stdout, "\n .........random......Random_order");
160 fprintf ( stdout, "\n .........tree........Tree Order (in2)");
161 fprintf ( stdout, "\n +reorder_column.....Reorders sequences of <in> according to <in2>");
162 fprintf ( stdout, "\n .........random......Random_order");
163 fprintf ( stdout, "\n .........tree..mode..Tree Order (comuted with mode: sarmat, idmat, blosum62mt...");
164 fprintf ( stdout, "\n +aln2random_aln SCR..Randomize the aln, S: swap sequences names");
165 fprintf ( stdout, "\n .....................Swap residues within colums");
166 fprintf ( stdout, "\n .....................Swap residues across the aln");
167 fprintf ( stdout, "\n +aln2sample......N......");
168 fprintf ( stdout, "\n +aln2bootstrap...N......");
171 fprintf ( stdout, "\n +chain...............Identifies all the intermediate sequences from <-in>");
172 fprintf ( stdout, "\n .....................needed to join every sequence pair in <-in2>");
174 fprintf ( stdout, "\n +aln2cons mat_name..Ouputs a consensus sequence");
175 fprintf ( stdout, "\n .....................The consensus is determined using mat");
176 fprintf ( stdout, "\n .....................By Default, mat=blosum62mt, name=Cons");
177 fprintf ( stdout, "\n +aln2resindex........Prints the sequence index of each residue in -in for each -in2 sequence");
178 fprintf ( stdout, "\n +collapse_aln <new name> <seq1> <seq2...> | file name");
179 fprintf ( stdout, "\n .....................Replaces a group of sequences with its consensus");
180 fprintf ( stdout, "\n .....................The replacement sequence is named <new_seq>");
181 fprintf ( stdout, "\n .....................List of sequences can be provided via a file");
182 fprintf ( stdout, "\n .....................File:>new_name seq1 seq2 seq3....");
183 fprintf ( stdout, "\n +original_seqnos.....Keep original seqnos [SWITCH]");
184 fprintf ( stdout, "\n +seqnos..............Print Seqnos [SWITCH]");
185 fprintf ( stdout, "\n +code_dna_aln........Undocumented") ;
186 fprintf ( stdout, "\n +grep..[NAME|SEQ|COMMENT]..[KEEP|REMOVE]..[string]......");
187 fprintf ( stdout, "\n .....................Keeps or Removes Sequences matching string");
188 fprintf ( stdout, "\n +extract_block <seq> <start> <end> | <seq> <pos> |<filename>");
189 fprintf ( stdout, "\n .....................Extract column pos OR [start to end[");
190 fprintf ( stdout, "\n .....................<filename> Format");
191 fprintf ( stdout, "\n .......................seq start end | seq pos");
192 fprintf ( stdout, "\n .......................# for comments");
193 fprintf ( stdout, "\n .......................! seq offset_value (0 by default)");
194 fprintf ( stdout, "\n .....................Can extract as many positions as needed");
195 fprintf ( stdout, "\n .....................seq=cons: measure positions on the full aln");
196 fprintf ( stdout, "\n +cat_aln.............Concatenates the alignments input via -in and -in2");
197 fprintf ( stdout, "\n +cat_aln.............-if no -in2, -in is expected to be a list of alignments to concatenate");
198 fprintf ( stdout, "\n +orthologous_cat..<mode>: mode=voronoi or nothing");
199 fprintf ( stdout, "\n ......................-in: sequences from different species");
200 fprintf ( stdout, "\n ..................... -in2: list of species in fasta");
201 fprintf ( stdout, "\n ..................... sequence must be named: <species>_<genename>");
202 fprintf ( stdout, "\n ..................... all paralogues will be concatenated");
204 fprintf ( stdout, "\n +aln2replicate N name");
205 fprintf ( stdout, "\n ..................... Generates N replicates in Fasta");
206 fprintf ( stdout, "\n ..................... Voronoi weights can be used");
208 fprintf ( stdout, "\n +msalist2cat_pwaln.min..max");
209 fprintf ( stdout, "\n .....................extract all pw projections and conctaenates those\n");
210 fprintf ( stdout, "\n .....................where id>=min and id<=max\n");
211 fprintf ( stdout, "\n .....................min and max can be omitted (min=0, max=100)\n");
213 fprintf ( stdout, "\n +seq2blast <matrix>..gather all possible homologues from NR (EBI BLAST)");
214 fprintf ( stdout, "\n +seq2msa <matrix>....makes a standard progressive alignment using matrix");
215 fprintf ( stdout, "\n +realign_block <c1> <c2> <pg>");
216 fprintf ( stdout, "\n .....................Realign column c1 to c2 (non inc.) with pg)");
217 fprintf ( stdout, "\n .....................pg reads fasta and outputs fasta");
218 fprintf ( stdout, "\n .....................pg -infile=<infile> -outfile=<outfile>");
219 fprintf ( stdout, "\n +extract_seq seq_name (start end seq_name start end...) | filename");
220 fprintf ( stdout, "\n .....................seq_name='*': every seq");
221 fprintf ( stdout, "\n .....................start='*' : real start");
222 fprintf ( stdout, "\n .....................end='*' : real end");
223 fprintf ( stdout, "\n .....................filename: fasta format");
224 fprintf ( stdout, "\n +extract_seq_list name1 name2");
225 fprintf ( stdout, "\n .....................Extracts entire sequences");
226 fprintf ( stdout, "\n +remove_seq sn1 sn2..Removes sequences sn1, sn2...");
227 fprintf ( stdout, "\n +remove_seq empty....Removes empty sequences (gap only)");
228 fprintf ( stdout, "\n +remove_seq unique...Remove all multiple occurences except the first");
229 fprintf ( stdout, "\n +thread_profile_on_msa <file>");
230 fprintf ( stdout, "\n .....................Threads a list of profiles on corresponding seq");
231 fprintf ( stdout, "\n .....................File: >seqname _R_ <msa file> [nlines]");
233 fprintf ( stdout, "\n +thread_dna_on_prot_aln");
234 fprintf ( stdout, "\n .....................-in DNA.seq and -in2 AA.aln");
235 fprintf ( stdout, "\n +thread_struc_on_aln");
236 fprintf ( stdout, "\n .....................-in structure and -in2 aln");
237 fprintf ( stdout, "\n +use_cons............Use the consensus for n[SWITCH]");
238 fprintf ( stdout, "\n +upper.n|[n1-n2].....n omitted sets everything to upper case");
239 fprintf ( stdout, "\n .....................To use n: provide a number_aln via:");
240 fprintf ( stdout, "\n .....................-struc_in <number_file> -struc_in_f number_aln");
241 fprintf ( stdout, "\n .....................if use_cons is set n, is read on the cons");
242 fprintf ( stdout, "\n .....................n: will upper every residue with a value of n in struc_in");
243 fprintf ( stdout, "\n .....................[n1-n2]: upper residues between n1 and n2");
244 fprintf ( stdout, "\n +lower n|[n1-n2]....See +upper");
245 fprintf ( stdout, "\n +switchcase n|[n1-n2]See +upper");
246 fprintf ( stdout, "\n +color_residue <seq> <pos> <color> | file");
247 fprintf ( stdout, "\n .....................File: seq_name pos color");
248 fprintf ( stdout, "\n .....................color: 0-9");
249 fprintf ( stdout, "\n +edit_residue <seq> <pos> <edit> | file");
250 fprintf ( stdout, "\n .....................File: seq_name pos color");
251 fprintf ( stdout, "\n .....................edit: upper|lower|symbol");
255 fprintf ( stdout, "\n +keep n|[n1-n2]....Only keep residues that have a score between n1 and n2");
257 fprintf ( stdout, "\n +invert..............Inverts the sequences: CAT => TAC");
258 fprintf ( stdout, "\n +rotate name Rotate an MSA, names each sequence name_col#");
259 fprintf ( stdout, "\n +convert n|[n1-n2] s1 s2 ....");
260 fprintf ( stdout, "\n +merge_annotation.... ");
262 fprintf ( stdout, "\n .....................Converts residues with your alignment");
263 fprintf ( stdout, "\n .....................similar to upper");
264 fprintf ( stdout, "\n .....................s1: ABCDe turns every ABCD into e");
265 fprintf ( stdout, "\n .....................s1: #e turns any residue into e");
266 fprintf ( stdout, "\n aln2short_aln L C S..Turns sequences into shorter sequences");
267 fprintf ( stdout, "\n .....................L: list of residues to keep");
268 fprintf ( stdout, "\n .....................S: Size of Streches replaced by symbol C");
271 fprintf ( stdout, "\n +random n l..........Generates N random sequences of len l");
272 fprintf ( stdout, "\n .....................You must provide a file with -in");
273 fprintf ( stdout, "\n +count n|[n1-n2] s1 s2....");
274 fprintf ( stdout, "\n .....................Counts residues with your alignment");
275 fprintf ( stdout, "\n .....................similar to convert");
276 fprintf ( stdout, "\n +print_format........prints the format name");
277 fprintf ( stdout, "\n +keep_name...........Keep the original sequence name on extraction");
279 fprintf ( stdout, "\n +remove_aa pos Ml Ncycle Random_len");
280 fprintf ( stdout, "\n .....................Randomly modifies an alignment");
281 fprintf ( stdout, "\n .....................pos=0: chosen randomly");
282 fprintf ( stdout, "\n .....................MaxLen of the deletions, Ncycle: number of cycles");
283 fprintf ( stdout, "\n .....................Random_len: 0 sets the len to maxlen, 1 to a random value");
284 fprintf ( stdout, "\n +remove_nuc.x........Remove Position 1, 2 or 3 of every codon");
285 fprintf ( stdout, "\n +evaluate matrix..gop..gep");
286 fprintf ( stdout, "\n .....................Make a similarity evaluation with matrix");
287 fprintf ( stdout, "\n .....................use -output=score_ascii, or score_html.");
288 fprintf ( stdout, "\n .....................You can filter on the values");
289 fprintf ( stdout, "\n +evaluate matrix..gop..gep");
290 fprintf ( stdout, "\n .....................Make an SP evaluation with matrix");
291 fprintf ( stdout, "\n .....................Uses Natural Gap penalties");
292 fprintf ( stdout, "\n .....................gop and gep must be negative");
293 fprintf ( stdout, "\n .....................use -output=color_ascii, color_html to get a color display");
295 fprintf ( stdout, "\n.....+evaluate_lat........Make a lateral evaluation with matrix");
296 fprintf ( stdout, "\n +msa_weight proc.....Computes weights using the procedure");
297 fprintf ( stdout, "\nRNA analysis Post Processing___________________________________________________");
298 fprintf ( stdout, "\n +aln2alifold.........Turns the MSA into a consensus structure");
299 fprintf ( stdout, "\n +add_alifold.........adds an alifold consensus structure");
301 fprintf ( stdout, "\n +alifold2analyze.mode..mode=stat_cache_list_aln_color_html_ps_usegap");
302 fprintf ( stdout, "\n .......................stat: compile Number of compensated mutations");
303 fprintf ( stdout, "\n .......................cache: ascii-code compensated mutations on aln");
304 fprintf ( stdout, "\n .......................html: color-code compensated mutations on aln");
305 fprintf ( stdout, "\n .......................aln: mark compensated mutations on stockholm aln");
306 fprintf ( stdout, "\n .......................usegap: do not ignore positions with gaps");
308 fprintf ( stdout, "\n +RNAfold_cmp.........compares the sec struc of in1 and in2 (computes them with alifold if missing)");
310 fprintf ( stdout, "\nMSA Post Processing___________________________________________________");
311 fprintf ( stdout, "\n +force_aln filename|seq1 res1 seq2 res2");
312 fprintf ( stdout, "\n .....................Forces residue 1 of seq1 to be aligned with res2 of seq 2");
313 fprintf ( stdout, "\n .....................In a file, there must be one pair of interaction/line");
314 fprintf ( stdout, "\n +sim_filter[_aln_Ix_iy_Cz_cw <seq>");
315 fprintf ( stdout, "\n ....................._<unaln or aln>, aln is assumed");
316 fprintf ( stdout, "\n ....................._I max identity to seq");
317 fprintf ( stdout, "\n ....................._i min identity to seq");
318 fprintf ( stdout, "\n ....................._C max cov on seq");
319 fprintf ( stdout, "\n ....................._c min cov on seq");
320 fprintf ( stdout, "\n +trim[_aln_%%%%50_n111_N50_T_Fn_fS_pS_max_sim_P0_K0] [string2]");
321 fprintf ( stdout, "\n ....................._<seq or aln>, aln is assumed");
322 fprintf ( stdout, "\n ....................._%%%%<max/min_percent_similarity>");
323 fprintf ( stdout, "\n ....................._max Or _min <keep sequences for which sim is the max or the min [Def: _max>");
324 fprintf ( stdout, "\n ....................._cov Or _sim Filter according to the coverage [Def: _sim]");
325 fprintf ( stdout, "\n ....................._n<max_number_of_sequence> ");
326 fprintf ( stdout, "\n ....................._N<percent_of_sequences_to_keep>");
327 fprintf ( stdout, "\n ....................._T Reorder the sequences according to a tree BEFORE triming");
328 fprintf ( stdout, "\n ....................._Fn Keep only sequences that have AT LEAST ONE residue aligned");
329 fprintf ( stdout, "\n ......................in the n first and n last columns. ");
330 fprintf ( stdout, "\n ....................._O<min sim> Remove outlayers that have less than min average sim with other sequences");
331 fprintf ( stdout, "\n ....................._Kn Forces the n top sequences to be kept");
332 fprintf ( stdout, "\n ....................._P_ Print a summary in stderr");
335 fprintf ( stdout, "\n .....................Keeping Sequences: Sequences provided via -in2 will be kept");
337 fprintf ( stdout, "\n .....................Keeping Sequences: Sequences whose name contains <string> in field fS will be kept");
338 fprintf ( stdout, "\n ....................._f<NAME|SEQ|COMMENT> designates a field");
339 fprintf ( stdout, "\n .....................<string> is a Perl regular expression");
340 fprintf ( stdout, "\n +aln2unalign Mode Penalty Threshold");
341 fprintf ( stdout, "\n .....................Identifies all the streches less conserved than than the average");
342 fprintf ( stdout, "\n .....................Mode: lower|number|unalign Act on all the resiues withs score<Thres");
343 fprintf ( stdout, "\n .....................Penalty: FSA penalty align2unalign, Def=90");
344 fprintf ( stdout, "\n .....................Threshold: Fraction of unaligned residues(0-9) Def=2");
346 fprintf ( stdout, "\n +clean_cdna..........Undocumented");
347 fprintf ( stdout, "\n +clean_maln..........Undocumented");
348 fprintf ( stdout, "\nTree Analysis___________________________________________________");
351 fprintf ( stdout, "\n +tree_prune..........Prune the tree -in using the sequences provided via -in2");
352 fprintf ( stdout, "\n +tree_cmp............Compares the tree -in and the tree -in2");
353 fprintf ( stdout, "\n +tree_cmp_list......Compares the tree -in and the tree_list -in2");
354 fprintf ( stdout, "\n .....................Sets the support as boostrap value in the -in tree");
356 fprintf ( stdout, "\n .....................-in and -in2 can contain different taxons");
357 fprintf ( stdout, "\n +tree_scan.P1..P2.....scans alignment <-in> with tree <-in2>)");
358 fprintf ( stdout, "\n ......................+tree_scan help to get P1 information");
359 fprintf ( stdout, "\n ......................+aln2tree help to get P2 information");
361 fprintf ( stdout, "\n .....................-in and -in2 can contain different taxons");
362 fprintf ( stdout, "\n +tree2node.......... Reports the node list along with the split");
363 fprintf ( stdout, "\n ..................... splits can be described with the seq order ");
364 fprintf ( stdout, "\n ..................... provided via -in3=<sequence> ");
366 fprintf ( stdout, "\n +treelist2groups.N....count all topologies within a list of trees");
367 fprintf ( stdout, "\n .....................-in is in fasta format with each name being a newick file");
368 fprintf ( stdout, "\n .....................-in2 can be a list of sequences used to trim the trees");
369 fprintf ( stdout, "\n ......................N can be used to unresolve the trees with Depth N");
370 fprintf ( stdout, "\n +treelist2lti.N.C.....Reports the average stability of each sequence neighborhood");
371 fprintf ( stdout, "\n ......................Species can be selected via -in2 [Fasta file with Taxon names]");
372 fprintf ( stdout, "\n ......................OR the sequences observed in C%% of the files are kept [Def: C=100]");
375 fprintf ( stdout, "\n +treelist2seq.C.......Reports the species observed in C%% of the trees");
376 fprintf ( stdout, "\n +treelist2splits......List and counts all the splits in a list of trees");
377 fprintf ( stdout, "\n ......................splits can be restricted to a list of sequences provided via -in2");
378 fprintf ( stdout, "\n +treelist2dmat.......outputs a distance matrix for a list of trees");
380 fprintf ( stdout, "\n +tree_compute n s....Computes a tree using the MSA provided with -in");
381 fprintf ( stdout, "\n ....................n:0-9, controls the way the MSA is filtered");
382 fprintf ( stdout, "\n ....................s:pam250mt|blosum62mt|categories|enthropy");
383 fprintf ( stdout, "\n ....................s:controls the column evaluation in MSA");
384 fprintf ( stdout, "\n +change_distances.f.f:float, sets all the distances to f in the tree");
385 fprintf ( stdout, "\n +change_bootstrap n..:n=0 removes all the bootstrap values");
386 fprintf ( stdout, "\n .....................:n!=0 adds a the value n to every node");
387 fprintf ( stdout, "\n +tree2dpatree........Replaces tree distances with the minimum %%ID in");
388 fprintf ( stdout, "\n .....................the depending subgroup. The ID is measured on an");
389 fprintf ( stdout, "\n .....................-in=TREE -in2=ALN");
390 fprintf ( stdout, "\n +unroot..............Removes the root in the input tree");
391 fprintf ( stdout, "\n +tree2group.N.I.P....Reports all the tree subgroup with at most Nseq");
392 fprintf ( stdout, "\n .....................and at min I%% identity. Output format can be read by");
393 fprintf ( stdout, "\n .....................collapse_tree. New groups are named P_1, P_2...");
394 fprintf ( stdout, "\n +collapse_tree.F.....Collapses trees. F is either a file or a list");
395 fprintf ( stdout, "\n .....................<new name> <seq1> <seq2>...");
396 fprintf ( stdout, "\n +aln2tree............Computes a tree");
397 fprintf ( stdout, "\n ..ktupN|aln|sarmat ktupN: match size N to estimate distances");
398 fprintf ( stdout, "\n .....................aln: Measures distances on aln");
399 fprintf ( stdout, "\n .....................sarmat: expects in to be a SAR matrix of O and I");
400 fprintf ( stdout, "\n ..nj | cw............Runs Neighbor Joining OR Cw to compute Tree");
401 fprintf ( stdout, "\n ..dpa................Turns the tree into a daptree (+tree2dpatree)");
402 fprintf ( stdout, "\n +node_sort..<name>...Sort leafs of tree n1, by node distance");
405 fprintf ( stdout, "\nMatrix Analysis___________________________________________________");
406 fprintf ( stdout, "\n +aln2mat_diaa........computes a dinucleotide matrix on a list of aln");
407 fprintf ( stdout, "\n +aln2mat.............computes a log odd matrix");
409 fprintf ( stdout, "\n +seq2lat_mat.........computes a transition matrix on seq provided via -in");
411 fprintf ( stdout, "\nStructure Analysis___________________________________________________");
412 fprintf ( stdout, "\n +struc2contacts.A.B D.Displays in capitals all the residues of A");
413 fprintf ( stdout, "\n ......................Less than D Angs from a residue of B");
414 fprintf ( stdout, "\n ......................A and B are pdb file, D is a distance in Angs");
415 fprintf ( stdout, "\n +seq2contacts.A.D.....Identifies all the residues in contact with ligands");
416 fprintf ( stdout, "\n ......................Ligands are in the FASTA header of struc in");
417 fprintf ( stdout, "\n ......................>Name _S_ [Target Struc] [Ligand1] [Chain] ...");
418 fprintf ( stdout, "\n ......................Output: number_fasta: 0=no contact, 1=ligand 1...");
419 fprintf ( stdout, "\n ......................9: residues in contact with more than 1 ligand");
420 fprintf ( stdout, "\n ......................Use -output=color_html/ascii to display result");
421 fprintf ( stdout, "\n +struc2nb...D.........Display a list of all the residues D appart");
422 fprintf ( stdout, "\n +rm_template...V......Removes _[S|G|R]_[template] to sequence names");
423 fprintf ( stdout, "\n ......................V: omitted | sequences <=> Output sequences");
424 fprintf ( stdout, "\n ......................V: template <=> Output templates");
426 fprintf ( stdout, "\n +add_template.F.......Add _[S|G|R]_[template] to sequence names");
427 fprintf ( stdout, "\n ......................F can either be a fasta file or an executable");
428 fprintf ( stdout, "\n ......................F: File: >name _S_ template");
429 fprintf ( stdout, "\n ......................F: executable: pg -infile=<seq> -outfile=<tagged>");
430 fprintf ( stdout, "\nMatrix Comparison___________________________________________________");
431 fprintf ( stdout, "\n +mat2cmp...............Returns the correlation coefficient between two matrices");
432 fprintf ( stdout, "\n .......................-in mat1 -input matrix, -in2 mat2 -input2 matrix");
433 fprintf ( stdout, "\n*********** INPUT FORMATS: Alignments *****************");
434 fprintf ( stdout, "\n AUTOMATIC RECOGNITION");
435 fprintf ( stdout, "\n perl_xxx:............. runs xxx onto the input file");
436 fprintf ( stdout, "\n xxxx <file> > outfile..xxx reads any formats, outputs fasta");
437 fprintf ( stdout, "\n amps_aln saga_aln ");
438 fprintf ( stdout, "\n clustal_aln fasta_aln msf_aln ");
439 fprintf ( stdout, "\n dali_aln gotoh_aln pima_aln");
440 fprintf ( stdout, "\n dialign_aln matrix conc_aln");
441 fprintf ( stdout, "\n NON AUTOMATIC RECOGNITION (use the -input file to specify the format");
442 fprintf ( stdout, "\n number_aln newick_tree");
443 fprintf ( stdout, "\n");
444 fprintf ( stdout, "\n*********** INPUT FORMATS: Sequences *****************");
445 fprintf ( stdout, "\n fasta_seq dali_seq pir_seq");
446 fprintf ( stdout, "\n barton_list_tc amps_sd_scores EST_fasta");
447 fprintf ( stdout, "\n gor_seq gor_struc number_fasta[*]");
448 fprintf ( stdout, "\n swissprot tc_lib pdb_struc");
449 fprintf ( stdout, "\n");
450 fprintf ( stdout, "\n*********** INPUT FORMATS: Structures *****************");
451 fprintf ( stdout, "\n rna_number");
452 fprintf ( stdout, "\n alifold");
453 fprintf ( stdout, "\n*********** OUTPUT FORMATS: Alignments ******************");
454 fprintf ( stdout, "\n compressed_aln saga_aln clustal_aln");
455 fprintf ( stdout, "\n phylip_aln msf_aln fasta_aln ");
456 fprintf ( stdout, "\n pir_aln ");
457 fprintf ( stdout, "\n color_html,color_ps......colored using the struc_in file ");
458 fprintf ( stdout, "\n color_protogene..........colors codons");
459 fprintf ( stdout, "\n color_exoset.............mixes conservation (gray) and introns (RGB)");
460 fprintf ( stdout, "\n color_pdf pw_lib_saga_aln tdna_aln");
461 fprintf ( stdout, "\n thread_dna_on_prot_aln");
462 fprintf ( stdout, "\n");
463 fprintf ( stdout, "\n*********** OUTPUT FORMATS: sequence ******************");
464 fprintf ( stdout, "\n fasta_seq fasta_seq1 gotoh_seq");
465 fprintf ( stdout, "\n gor_seq cache_id");
466 fprintf ( stdout, "\n tblastx_db1 tblastx_db2 tblastx_db3");
467 fprintf ( stdout, "\n*********** OUTPUT FORMATS: weights ******************");
468 fprintf ( stdout, "\n constraints saga_pw_sd_weights nseq\n");
469 fprintf ( stdout, "\n");
470 fprintf ( stdout, "\n*********** OUTPUT Formats: special ****************");
471 fprintf ( stdout, "\n len name statistics<_hnrglNL>");
472 fprintf ( stdout, "\n sim............outputs a similarity matrix based on an id comparison of -in");
473 fprintf ( stdout, "\n sim_sarmat.....in is sar matrix");
474 fprintf ( stdout, "\n sim_idscore....makes dp alignment of the sequences using Blosum62mt");
475 fprintf ( stdout, "\n sim_idscoreDNA.makes dp alignment of the sequences using idmat");
476 fprintf ( stdout, "\n sim............if -in2 is set: in1 vs in2, idscore");
478 fprintf ( stdout, "\n code_name......Outputs a compact list of names for code/decode");
482 fprintf ( stdout, "\n");
485 fprintf ( stdout, "\n");
489 argv=standard_initialisation (argv, &argc);
492 for ( a=1; a< argc; a++)
494 if (a==1 && argv[1][0]!='-')
496 sprintf( in_file, "%s", argv[a]);
498 else if ( strcmp ( argv[a], "-in_f")==0 ||strm(argv[a],"-input") )
500 if ( strcmp ( argv[a], "-in_f")==0) fprintf ( stdout,"\nWARNING: %s deprecated, use -input instead", argv[a]);
502 sprintf ( in_format, "%s", argv[a+1]);
506 else if ( strcmp ( argv[a], "-cache")==0 )
508 sprintf (cache, "%s", argv[a+1]);
514 else if ( strcmp ( argv[a], "-exon_boundaries")==0 )
517 set_string_variable ("exon_boundaries", argv[a+1]);
520 else if ( strcmp ( argv[a], "-overaln_threshold")==0 )
523 set_int_variable ("overaln_threshold", atoi(argv[a+1]));
526 else if ( strcmp ( argv[a], "-overaln_target")==0 )
529 set_int_variable ("overaln_target", atoi(argv[a+1]));
532 else if ( strcmp ( argv[a], "-overaln_P1")==0 )
535 set_int_variable ("overaln_P1", atoi(argv[a+1]));
538 else if ( strcmp ( argv[a], "-overaln_P2")==0 )
541 set_int_variable ("overaln_P2", atoi(argv[a+1]));
544 else if ( strcmp ( argv[a], "-overaln_P3")==0 )
547 set_int_variable ("overaln_P3", atoi(argv[a+1]));
550 else if ( strcmp ( argv[a], "-overaln_P4")==0 )
553 set_int_variable ("overaln_P4", atoi(argv[a+1]));
557 else if ( strcmp ( argv[a], "-in2_f")==0||strm(argv[a],"-input2") )
559 if ( strcmp ( argv[a], "-in_f")==0) fprintf ( stdout,"\nWARNING: %s deprecated, use -input2 instead", argv[a]);
561 sprintf ( in2_format, "%s", argv[a+1]);
564 else if ( strcmp ( argv[a], "-seqnos")==0)
566 sprintf (action_list[n_actions++], "seqnos");
569 else if ( strcmp( argv[a], "-action")==0)
571 while ((a+1)<argc && argv[a+1][0]!='-')
573 sprintf (action_list[n_actions++], "%s", argv[a+1]);
577 else if ( strcmp ( argv[a], "-keep_case")==0)
579 if(!NEXT_ARG_IS_FLAG)RAD->keep_case=1;
580 else RAD->keep_case=(strm3(argv[a], "on","ON","On"))?1:0;
584 else if ( strcmp ( argv[a], "-conv")==0)
586 if ( strncmp ( argv[a+1],"set",3)==0)RAD->symbol_list=make_symbols (argv[++a],&(RAD->n_symbol));
589 RAD->symbol_list=declare_char (STRING, STRING);
590 while(!NEXT_ARG_IS_FLAG)
592 sprintf ( RAD->symbol_list[RAD->n_symbol], "%s", argv[++a]);
597 else if ( strcmp ( argv[a], "-struc_in_f")==0 ||strcmp ( argv[a], "-input3")==0 )
599 sprintf ( struc_in_format, "%s", argv[a+1]);
602 else if ( strcmp ( argv[a], "-out_f")==0 ||strm(argv[a],"-output") )
604 if ( strcmp ( argv[a], "-out_f")==0) fprintf (stdout, "\nWARNING: %s deprecated, use -output instead", argv[a]);
605 sprintf ( out_format, "%s", argv[a+1]);
608 else if ( strm ( argv[a], "-struc_out_f") || strm ( argv[a], "-output_struc") )
610 sprintf ( struc_out_format, "%s", argv[a+1]);
613 else if ( strcmp (argv[a],"-in")==0)
615 sprintf( in_file, "%s", argv[a+1]);
618 else if ( strcmp (argv[a],"-rename")==0)
620 sprintf( rename_file, "%s", argv[a+1]);
623 else if ( strcmp (argv[a],"-code")==0)
626 sprintf( rename_file, "%s", argv[a+1]);
629 else if ( strcmp (argv[a],"-decode")==0)
632 sprintf( rename_file, "%s", argv[a+1]);
635 else if ( strcmp (argv[a],"-in2")==0)
637 sprintf( in2_file, "%s", argv[a+1]);
640 else if ( strcmp (argv[a],"-coor")==0)
642 sprintf( RAD->coor_file, "%s", argv[a+1]);
645 else if (strcmp (argv[a],"-out")==0)
647 sprintf (out_file, "%s", argv[a+1]);
650 else if (strcmp (argv[a],"-out2")==0)
652 sprintf (out2_file, "%s", argv[a+1]);
655 else if ( strcmp (argv[a],"-struc_in")==0 || strcmp (argv[a],"-in3")==0 )
657 sprintf( struc_in_file, "%s", argv[a+1]);
660 else if (strcmp (argv[a],"-struc_out")==0)
662 sprintf (struc_out_file, "%s", argv[a+1]);
665 else if ( strcmp ( argv[a], "-rm_gap")==0)
669 else if ( strcmp ( argv[a], "-print_format")==0)
673 else if ( strcmp ( argv[a], "-no_warning")==0)
675 set_warning_mode (NO);
680 fprintf ( stdout, "\nUNKNOWN OPTION: %s", argv[a]);
681 myexit(EXIT_FAILURE);
684 /****************************************************************/
686 /* Data Preparation */
689 /****************************************************************/
691 prepare_cache (cache);
692 /****************************************************************/
697 /****************************************************************/
700 if ( strm (out_format, "hasch"))
702 fprintf ( stdout, "%d\n", (int)hash_file(in_file));
708 rename_list=read_rename_file ( rename_file,code);
712 if ((D1=read_data_structure (in_format, in_file,RAD))!=NULL)
714 in_format=(in_format && in_format[0])?in_format:identify_seq_format(in_file);
716 if (print_format)fprintf ( stdout, "\nFILE:%s FORMAT:%s\n", in_file, in_format);
718 else if ( in_file[0])
720 fprintf ( stdout, "\nFORMAT of file %s Not Supported[FATAL:%s]\n", in_file, PROGRAM);
721 myexit(EXIT_FAILURE);
724 if ((D2=read_data_structure (in2_format, in2_file,RAD))!=NULL){if (print_format)fprintf ( stderr, "\nFILE:%s FORMAT:%s\n", in2_file, (in2_format&&in2_format[0])?in2_format:identify_seq_format(in2_file));}
726 else if (!D2 && in2_file[0])
728 fprintf ( stderr, "\nFORMAT of file %s Not Supported [FATAL:%s]\n", in2_file, PROGRAM);
729 myexit(EXIT_FAILURE);
735 if ((D_ST=read_data_structure (struc_in_format, struc_in_file,RAD)))
745 while ((entry=extract_entry (CL)))
747 if ( D_ST->S)(D_ST->S)->seq[entry[SEQ1]][entry[R1]-1]=entry[WE];
749 thread_seq_struc2aln (D_ST->A, D_ST->S);
751 else if ( name_is_in_list ("cons", ((D_ST)->A)->name, ((D_ST)->A)->nseq, 100));
754 D_ST->A=copy_aln ( D1->A, D_ST->A);
756 thread_seq_struc2aln (D_ST->A, D_ST->S);
759 else if ((strcmp (struc_in_format, "rna_number")==0) && in_file[0])
761 D_ST->RNA_ST=read_rna_struc_number((D1->A),struc_in_file);
763 else if ( struc_in_format[0] && struc_in_file[0])
766 fprintf ( stderr, "\nSTRUC %s UNKNOWN[FATAL]", struc_in_format);
767 myexit(EXIT_FAILURE);
771 D_ST=vcalloc ( 1, sizeof (Sequence_data_struc));
774 action=declare_char(100, 100);
775 for ( a=0; a< n_actions;)
777 if (action_list[a][0]!='+')
779 fprintf ( stderr, "\nWARNING: Action %s Unknown. Actions start with a +", action_list[a]);
780 myexit (EXIT_FAILURE);
785 sprintf ( action[b++], "%s", action_list[a++]+1);
786 while ( a<n_actions && action_list[a][0]!='+')sprintf ( action[b++], "%s", action_list[a++]);
787 modify_data( D1, D2, D_ST, action,b, RAD);
793 if (D1)D1->A= rename_seq_in_aln(D1->A, rename_list);
794 if (D2)D2->A=rename_seq_in_aln (D2->A, rename_list);
795 if (D_ST)D_ST->A=rename_seq_in_aln (D_ST->A,rename_list);
797 if (D1)D1->T =rename_seq_in_tree (D1->T, rename_list);
798 if (D2)D2->T =rename_seq_in_tree (D2->T, rename_list);
799 if (D_ST)D_ST->T=rename_seq_in_tree (D_ST->T,rename_list);
803 if ( !out_format[0] && ! struc_out_format[0])sprintf ( out_format, "%s", (in_format && in_format[0])?in_format:"fasta_aln");
804 main_output ( D1, D2, D_ST, out_format, out_file);
805 main_output ( D1, D2, D_ST, struc_out_format, struc_out_file);
812 /**************************************************************************************************/
813 /***************************** FORMAT GUESSING ******************************************/
814 /**************************************************************************************************/
/*
 * read_data_structure: load in_file into a newly allocated
 * Sequence_data_struc, picking the reader from the format name in_format.
 *
 *   in_format  format name such as "fasta_aln", "msf_aln" or "newick".
 *   in_file    path of the file to read.
 *   RAD        action settings; keep_case and rm_gap are read here.
 *
 * NULL is returned when in_file is empty or when in_format is empty.
 * NOTE(review): the success path is expected to return D - confirm.
 */
815 Sequence_data_struc *read_data_structure ( char *in_format, char *in_file, Action_data_struc *RAD)
818 Sequence_data_struc *D;
819 char **seq_name=NULL, **sequences=NULL;
823 D=vcalloc ( 1, sizeof (Sequence_data_struc));
/* NOTE(review): D is already allocated here and is not released by the early returns below - confirm whether that matters. */
826 if (!in_file[0])return NULL;
/* NOTE(review): presumably only executed when the caller supplied no format - confirm the guarding condition. */
829 in_format=identify_seq_format(in_file);
831 if (!in_format[0])return NULL;
/* Common set-up: fresh alignment, case and gap policy from RAD, and a record of the format and file name. */
835 D->A=declare_Alignment(NULL);
836 if ( RAD->keep_case)(D->A)->residue_case=KEEP_CASE;
838 D->rm_gap=RAD->rm_gap;
839 sprintf ( D->format, "%s", in_format);
840 sprintf ( D->file, "%s", in_file);
/* Format dispatch: the first matching name wins. */
845 if ( strm2(in_format,"saga_aln","clustal_aln"))
847 read_aln (in_file, D->A);
852 else if ( strm (in_format, "treefile_list"))
855 D->S=get_tree_file_list(in_file);
856 D->A=seq2aln(D->S, D->A,NO_PAD);
858 else if ( strm (in_format, "file_list") || strm (in_format, "list"))
860 D->S=get_file_list(in_file);
861 D->A=seq2aln(D->S, D->A,KEEP_GAP);
863 else if ( strm (in_format, "fasta_tree"))
866 D->S=get_fasta_tree (in_file, NULL);
867 D->A=seq2aln(D->S, D->A,NO_PAD);
/* tree_list: rewrite the entries as ">Tree_n" records in a temporary file, then re-read that file as fasta_tree. */
870 else if ( strm (in_format, "tree_list") || strm (in_format, "treelist"))
878 seq_file=vtmpnam(NULL);
/* NOTE(review): seq_file is opened for writing here and again two statements below; the first handle (seq) is not used afterwards in the visible code - confirm it is closed. */
879 seq=vfopen (seq_file, "w");
880 line=file2lines (in_file);
881 fp=vfopen (seq_file, "w");
/* line[0] is parsed as the loop bound; entries are written starting at index 1. */
882 for ( n=1; n<atoi(line[0]); n++)
884 fprintf ( fp, ">Tree_%d\n%s\n", n,line[n]);
888 free_char (line, -1);
/* NOTE(review): the D allocated by this call is not the one returned here - confirm it is released. */
889 return read_data_structure ( "fasta_tree",seq_file,RAD);
892 else if (strm (in_format, "matrix"))
894 D->M=read_matrice (in_file);
/* Newick trees: the tree is read first, then a sequence set and an alignment are derived from it. */
896 else if (strm4 (in_format, "newick_tree", "newick", "nh", "new_hampshire"))
898 D->T=main_read_tree (in_file);
899 D->S=tree2seq(D->T, NULL);
900 D->A=seq2aln (D->S,D->A, 0);
902 else if (strm (in_format, "blast_aln"))
904 if (read_blast_aln (in_file, D->A))
913 else if ( strm( in_format,"number_aln"))
915 read_number_aln (in_file, D->A);
918 else if ( strm( in_format,"stockholm_aln"))
920 read_stockholm_aln (in_file, D->A);
923 else if ( strm( in_format,"gotoh_aln"))
925 read_gotoh_aln (in_file, D->A);
929 else if ( strm ( in_format, "msf_aln"))
931 read_msf_aln (in_file, D->A);
934 else if ( strm ( in_format, "amps_aln"))
936 read_amps_aln (in_file, D->A);
/* Formats converted to FASTA by an external perl script; contains_gap is cleared before building the alignment. */
939 else if ( strm (in_format, "excel_seq"))
941 D->S=perl_reformat2fasta ("excel2fasta.pl",in_file);
942 (D->S)->contains_gap=0;
943 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
945 else if ( strm (in_format, "pavie_seq"))
947 D->S=perl_reformat2fasta ("pavie2fasta.pl",in_file);
948 (D->S)->contains_gap=0;
949 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
/* Generic form: everything after the "perl_" prefix is passed as the script name. */
951 else if ( strncmp (in_format, "perl_",5 )==0)
953 D->S=perl_reformat2fasta (in_format+5,in_file);
954 (D->S)->contains_gap=0;
955 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
957 else if ( strm (in_format, "number_fasta"))
959 D->S=get_fasta_sequence_num (in_file, NULL);
960 (D->S)->contains_gap=0;
961 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
963 else if ( strm (in_format, "raw_fasta"))
965 D->S=get_fasta_sequence_raw (in_file, NULL);
966 (D->S)->contains_gap=0;
967 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
970 else if ( strm2 (in_format, "fasta_aln", "fasta_seq"))
973 D->S=get_fasta_sequence (in_file, NULL);
/* contains_gap is cleared only for fasta_aln; for fasta_seq it keeps the value set by the reader. */
974 if ( strcmp (in_format, "fasta_aln")==0)(D->S)->contains_gap=0;
975 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
/* NOTE(review): same test as the earlier fasta_tree branch, so this one looks unreachable - confirm. */
977 else if ( strm (in_format, "fasta_tree"))
980 D->S=get_fasta_tree (in_file, NULL);
981 D->A=seq2aln(D->S, D->A, NO_PAD);
984 else if ( strm (in_format, "pdb") || strm (in_format, "pdb_struc"))
986 D->S=get_pdb_sequence (in_file);
/* Failing to obtain a PDB sequence is fatal. NOTE(review): presumably guarded by a NULL test on D->S - confirm. */
989 add_warning (stderr, "FAILED TO find PDB File %s", in_file);
990 myexit (EXIT_FAILURE);
992 D->A=seq2aln(D->S, D->A,RAD->rm_gap);
/* In the branches below the value returned by seq2aln is not stored, unlike the branches above that assign it to D->A. */
994 else if ( strm2(in_format, "pir_seq", "pir_aln"))
996 D->S=get_pir_sequence ( in_file,NULL );
997 seq2aln(D->S, D->A, RAD->rm_gap);
999 else if ( strm(in_format, "gor_seq") )
1001 D->S=get_gor_sequence ( in_file,NULL );
1002 seq2aln(D->S, D->A, RAD->rm_gap);
1004 else if ( strm2 ( in_format, "dali_aln", "dali_seq"))
1006 D->S=get_sequence_dali ( in_file);
1007 seq2aln(D->S, D->A, RAD->rm_gap);
1009 else if ( strm (in_format, "barton_list_tc"))
/* The result of this call is not stored in D. */
1011 get_barton_list_tc_seq ( in_file);
1013 else if ( strm (in_format, "amps_sd_scores"))
1015 D->W=get_amps_sd_scores ( in_file);
1018 else if ( strm ( in_format, "pima_aln"))
1020 D->S=get_pima_sequence ( in_file);
1021 seq2aln (D->S, D->A, RAD->rm_gap);
1023 else if ( strm( in_format, "gor_struc"))
1025 D->S=get_struc_gor ( in_file);
1026 seq2aln(D->S, D->A, RAD->rm_gap);
1028 else if ( strm( in_format, "dialign_aln"))
1030 D->S=get_dialign_sequence ( in_file);
1031 seq2aln (D->S, D->A, RAD->rm_gap);
/* Library formats: sequences are read first, then the constraint list is declared and filled from the same file. */
1033 else if ( strm( in_format, "tc_lib") || strm( in_format, "mocca_lib") || strm( in_format, "lib"))
1035 read_seq_in_list (in_file,&nseq,&sequences,&seq_name);
1036 D->S=fill_sequence_struc ( nseq, sequences, seq_name);
1037 D->CL=declare_constraint_list ( D->S,NULL, NULL, 0,NULL, NULL);
1038 D->CL=read_constraint_list_file(D->CL,in_file);
1039 seq2aln (D->S, D->A, RAD->rm_gap);
1040 free_char (sequences,-1);
1041 free_char (seq_name, -1);
1043 else if ( strm( in_format,"swissprot_seq"))
1045 D->S=get_swissprot_sequence ( in_file,NULL);
1046 seq2aln (D->S, D->A, RAD->rm_gap);
1048 else if (strm (in_format, "alifold"))
1050 D->S=read_alifold ( in_file);
1051 seq2aln (D->S, D->A,0);
/* Record the source file name on every alignment entry. */
1060 for ( a=0; a<(D->A)->nseq; a++)sprintf ( (D->A)->file[a], "%s", in_file);
/* Same for the sequence set. NOTE(review): the bound is (D->A)->nseq although the writes go to (D->S)->file - confirm both counts agree. */
1064 for ( a=0; a<(D->A)->nseq; a++)sprintf ( (D->S)->file[a], "%s", in_file);
/* read_sequences: thin wrapper that forwards name to main_read_seq and returns its result. */
1069 Sequence *read_sequences (char *name)
1071 return main_read_seq (name);
/*
 * alifold2aln: read an alifold file with read_alifold and turn the result
 * into an alignment with seq2aln (S, NULL, 0).
 */
1073 Alignment * alifold2aln (char *file)
1076 S=read_alifold(file);
/* Entry 0 is overwritten with the content of entry 1 before the conversion. */
1077 sprintf ( S->seq[0],"%s", S->seq[1]);
1078 return seq2aln (S, NULL, 0);
/*
 * read_alifold: build a two-entry Sequence from an alifold file.
 * Entry 0 is named "cons" and entry 1 "#=GC SS_cons".
 * NOTE(review): how the file lines are copied into seq[0] and seq[1] is not
 * obvious from the statements below - confirm.
 */
1080 Sequence * read_alifold (char *file)
/* The total character count of the file is passed to declare_sequence; the last argument (2) matches the two entries filled below. */
1085 S=declare_sequence (1,count_n_char_in_file (file),2);
1086 list=file2lines (file);
/* Entry 0: newline and space are substituted with "\0", and "_" with STOCKHOLM_STRING. */
1090 substitute (S->seq[0], "\n", "\0");
1091 substitute (S->seq[0], " ", "\0");
1092 substitute (S->seq[0], "_", STOCKHOLM_STRING);
1093 l=strlen (S->seq[0]);
/* Entry 1: same clean-up, but here it is "." that becomes STOCKHOLM_STRING. */
1094 substitute (S->seq[1], "\n", "\0");
1095 substitute (S->seq[1], " ", "\0");
1096 substitute (S->seq[1], ".", STOCKHOLM_STRING);
1098 sprintf (S->name[0], "cons");
1099 sprintf (S->name[1], "#=GC SS_cons");
/*
 * main_read_seq: read the sequences stored in file name, choosing the reader
 * from the format reported by identify_seq_format. Formats with no dedicated
 * branch fall back to cw_read_sequences.
 */
1108 Sequence * main_read_seq ( char *name)
1116 format=identify_seq_format (name);
/* Optional trace of the detected format, enabled through DEBUG_REFORMAT. */
1119 if ( getenv4debug ("DEBUG_REFORMAT"))fprintf ( stderr, "\n\nFormat %s\n", format);
1122 if (format &&strm(format, "fasta_seq"))
1124 S= get_fasta_sequence ( name, NULL);
1126 else if (format &&strm(format, "pir_seq")) S= get_pir_sequence ( name, NULL);
1127 else if (format &&strm(format,"swissprot_seq"))S= get_swissprot_sequence (name, NULL);
/* strstr is used here, so any format name containing "aln" is read as an alignment. */
1128 else if (format && strstr (format, "aln"))
1130 A=main_read_aln ( name, NULL);
/* Substring match again: "tc_lib_02" is also accepted by this branch. */
1135 else if ( format && strstr (format, "tc_lib"))
1138 char **sequences=NULL, **seq_name=NULL;
1140 read_seq_in_list (name,&nseq,&sequences,&seq_name);
1141 S=fill_sequence_struc ( nseq, sequences, seq_name);
1142 for ( b=0; b< S->nseq; b++)sprintf ( S->file[b], "%s",name);
1143 free_char (seq_name, -1);free_char (sequences, -1);
1147 /*Use The ClustalW routine*/
1148 S=cw_read_sequences (name);
/* Record the source file on every sequence, then normalise the set with clean_sequence. */
1151 for ( a=0; a<S->nseq; a++)sprintf ( S->file[a], "%s", name);
1154 S=clean_sequence ( S);
/*
 * main_read_aln: read the alignment stored in file name into A.
 *
 *   name  file name; NULL yields NULL.
 *   A     alignment to fill; when NULL a new one is declared.
 *
 * Returns NULL when name is NULL, when neither name nor name+1 exists, or
 * when a fasta_seq / pir_seq file holds sequences of different lengths.
 */
1158 Alignment * main_read_aln ( char *name, Alignment *A)
1162 static char *format;
1167 if ( !name)return NULL;
/* If name is not an existing file, retry without its first character; a leading A or S is then skipped. */
1168 else if (!check_file_exists(name))
1170 if ( !check_file_exists (name+1))return NULL;
1171 else if ( name[0]=='A') name++;
1172 else if ( name[0]=='S') name++;/*Line Added for the -convert flag of T-Coffee*/
1176 if (!A)A=declare_aln(NULL);
1177 format=identify_seq_format (name);
/* NOTE(review): only the first strm call is guarded by the NULL test on format; the two others run even when format is NULL. */
1181 if ((format && strm(format, "saga_aln" )) ||strm(format, "clustal_aln")||strm(format, "t_coffee_aln" ) )
1184 read_aln ( name, A);
1187 else if (format && strm (format, "conc_aln"))A=input_conc_aln (name,NULL);
1188 else if (format &&strm(format, "msf_aln" ))read_msf_aln ( name, A);
1189 else if (format &&strm(format, "blast_aln"))read_blast_aln (name, A);
1190 else if (format &&(strm(format, "fasta_aln")))
1194 S=get_fasta_sequence ( name, NULL);
1199 else if (format &&strm(format, "pir_aln"))
1201 S=get_pir_sequence ( name, NULL);
/* A plain sequence file is accepted as an alignment only if all its sequences have the same length; otherwise S and A are freed and NULL is returned. */
1205 else if (format && strm(format, "fasta_seq") && A)
1207 S=get_fasta_sequence ( name, NULL);
1209 for ( a=1; a<S->nseq; a++)if ( strlen (S->seq[a-1])!=strlen (S->seq[a])){free_sequence (S, S->nseq); free_aln (A); return NULL;}
1214 else if (format && strm(format, "pir_seq") && A)
1216 S=get_pir_sequence ( name, NULL);
1218 for ( a=1; a<S->nseq; a++)if ( strlen (S->seq[a-1])!=strlen (S->seq[a])){free_sequence (S, S->nseq); free_aln (A); return NULL;}
/* Duplicated names trigger a warning and are made unique. check_list_for_dup is called a second time to obtain the name printed in the message. */
1229 if ( check_list_for_dup( A->name, A->nseq))
1231 fprintf ( stderr, "\nWARNING (main_read_aln): %s is duplicated in File %s ", check_list_for_dup( A->name, A->nseq), A->file[0]);
1232 A=aln2unique_name_aln(A);
/* Attach a sequence set: IN_SEQ when it is set, otherwise one derived from the alignment itself. */
1235 if (IN_SEQ)A->S=IN_SEQ;
1236 else if (!A->S){A->S=aln2seq(A);}
1238 A->S=ungap_seq(A->S);
1239 A=fix_aln_seq(A, A->S);
1241 for ( a=0; a< A->nseq; a++) sprintf ( A->file[a], "%s", name);
/* identify_aln_format: alias of identify_seq_format; file is forwarded unchanged. */
1248 char * identify_aln_format ( char *file)
1250 /*Identifies known sequence and alignment formats*/
1251 return identify_seq_format (file);
/*
 * identify_seq_format: run the format detectors on file in a fixed order and
 * write the name of the first one that matches into format. When none
 * matches, format is left empty. A missing file is fatal (myexit).
 */
1253 char * identify_seq_format ( char *file)
1256 /*Identifies known sequence and alignment formats*/
/* The name buffer holds 100 chars; it is allocated when format is NULL and otherwise reset to the empty string. */
1258 if ( format==NULL)format=vcalloc ( 100, sizeof (char));
1259 else format[0]='\0';
1263 if ( !check_file_exists(file))
1265 fprintf (stderr, "ERROR: %s Does Not Exist [FATAL:%s]\n",file, PROGRAM);
1266 myexit (EXIT_FAILURE);
/* Order matters: the chain stops at the first positive test. fasta_seq is tested before fasta_aln, and pir_aln before pir_seq. */
1268 else if ( is_stockholm_aln (file))sprintf (format, "stockholm_aln");
1269 else if ( is_blast_file (file))sprintf ( format, "blast_aln");
1270 else if ( is_pdb_file(file))sprintf ( format, "pdb_struc");
1271 else if ( format_is_msf (file))sprintf ( format, "msf_aln");
1272 else if ( format_is_fasta_seq(file))sprintf ( format, "fasta_seq");
1273 else if ( format_is_fasta_aln(file))sprintf ( format, "fasta_aln");
1274 else if ( format_is_pir_aln (file))sprintf ( format, "pir_aln");
1275 else if ( format_is_pir_seq (file))sprintf ( format, "pir_seq");
1276 else if ( format_is_oligo (file))sprintf ( format, "oligo_aln");
1277 else if ( format_is_swissprot (file))sprintf ( format, "swissprot_seq");
/* Files accepted by format_is_saga are reported under the name "clustal_aln". */
1278 else if ( format_is_saga (file))sprintf ( format, "clustal_aln");
1279 else if ( format_is_conc_aln (file))sprintf ( format, "conc_aln");
1280 else if ( is_lib (file))sprintf ( format, "tc_lib");
1281 else if ( is_lib_02 (file))sprintf ( format, "tc_lib_02");
1282 else if ( is_newick(file))sprintf ( format, "newick_tree");
1286 //add_warning ( stderr, "\nThe Format of File: %s was not recognized [SERIOUS:%s]",file, PROGRAM);
/*
 * identify_list_format: for each of the n entries of list, determine its type
 * with identify_format and rewrite the entry in place as the type character
 * followed by the name. An entry whose type comes back as '?' is reported as
 * an error.
 */
1291 char **identify_list_format ( char **list, int n)
1300 declare_name (name);
1301 for ( a=0; a< n; a++)
1304 sprintf (name, "%s", list[a]);
/* NOTE(review): presumably string points at name before this call - confirm. identify_format may move the pointer it receives. */
1306 if ((mode=identify_format ( &string))!='?')
1308 sprintf ( name, "%s", string);
1309 sprintf ( list[a], "%c%s", mode,name);
1313 fprintf ( stderr, "\nERROR: %s not recognised [FATAL:%s]", name, PROGRAM);
/*
 * name2type_name: build a copy of name that carries a one-letter type prefix.
 * The copy is allocated with room for one extra character plus the
 * terminator.
 */
1322 char * name2type_name ( char *name)
1324 /*turns <file> into <Sfile>, <Afile>...*/
1328 new_name=vcalloc ( strlen (name)+2, sizeof (char));
1329 sprintf ( new_name, "%s", name);
/* A name that starts with one of the type letters and is not itself an existing file is copied as is. */
1330 if (is_in_set (name[0], "ALSMXPRW") && !check_file_exists(name))
1332 sprintf ( new_name, "%s", name);
/* NOTE(review): identify_format can advance the pointer it is given (see its W case); if that happens new_name no longer points at the start of the allocation when it is written below - confirm. */
1336 mode=identify_format (&new_name);
1337 sprintf ( new_name, "%c%s", mode,name);
/*
 * identify_format: return the one-letter type code of the name stored in
 * fname[0]. The pointer in fname[0] may be advanced past a leading type
 * letter.
 * NOTE(review): mode is presumably initialised from the first character of
 * fname[0] - confirm.
 */
1342 char identify_format (char **fname)
/* Explicit prefix: the letter is trusted when the remainder of the name is an existing file, a matrix (X) or a method (M). */
1347 if ((is_in_set (mode, "ALMSPR") && check_file_exists(fname[0]+1)) ||(mode=='X' && is_matrix ( fname[0]+1)) ||(mode=='M' && is_method(fname[0]+1)) )
/* A W prefix on a name that is not an existing file is skipped. */
1352 else if (mode=='W' && !check_file_exists(fname[0])){fname[0]++;}
1356 /*WARNING: Order matters => internal methods can be confused with files, must be checked last*/
/* No usable prefix: probe the content, first match wins. */
1357 if (is_lib(fname[0]))mode='L';
1358 else if (is_pdb_file(fname[0]))mode='P';
1359 else if (is_seq(fname[0]))mode='S';
1360 else if (is_aln(fname[0]))mode='A';
1361 else if (is_matrix(fname[0]))mode='X';
1362 else if (is_method(fname[0]))mode='M';
/*
 * is_pdb_name: ask the external extract_from_pdb tool whether name is a PDB
 * identifier. Answers are cached in static buffers so each name is only
 * queried once. The cached and returned value is 1 or 0.
 */
1370 int is_pdb_name ( char *name)
1375 static char **buf_names;
1376 static int *buf_result;
/* Cache storage: 1000 names of up to 100 chars, with one result per name. */
1384 buf_names=declare_char (1000, 100);
1385 buf_result=vcalloc (1000, sizeof (int));
/* Cache hit: return the stored answer without running the tool. */
1387 if ( (result=name_is_in_list ( name, buf_names,nbuf,100))!=-1)return buf_result[result];
1391 result_file=vtmpnam (NULL);
/* NOTE(review): name is placed inside single quotes in a shell command; a name containing a single quote would break the quoting - confirm callers only pass trusted names. */
1393 sprintf ( command, "extract_from_pdb -is_pdb_name \'%s\' > %s", name, result_file);
1394 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:is_pdb_name] %s\n", command);
1395 my_system ( command);
1397 fp=vfopen ( result_file, "r");
/* NOTE(review): the fscanf return value is not checked; if nothing is parsed, result keeps the -1 from the cache lookup and is stored as 0 below. */
1398 fscanf ( fp, "%d", &result);
1400 vremove ( result_file);
/* Store the answer, normalised to 1 or 0. NOTE(review): nbuf is not compared against the 1000-entry capacity in the statements visible here - confirm. */
1402 sprintf ( buf_names[nbuf], "%s", name);
1403 result=buf_result[nbuf++]=(result==1)?1:0;
/*
 * get_pdb_id: run extract_from_pdb -get_pdb_id on a PDB file and copy the
 * token it prints into a newly allocated string (id).
 */
1409 char* get_pdb_id ( char *file)
1411 /*receives the name of a pdb file*/
1412 /*reads the structure id in the header*/
1413 /*returns the pdb_id*/
1415 char command[10000];
1423 tmp_name=vtmpnam(NULL);
/* Prefer the copy stored in the cache directory when it exists, otherwise use file as given. */
1425 sprintf ( cached, "%s/%s", get_cache_dir(),file);
1426 if ( check_file_exists(cached))sprintf ( fname, "%s", cached);
1427 else sprintf ( fname, "%s", file);
1429 sprintf ( command, "extract_from_pdb -get_pdb_id %s > %s",fname, tmp_name);
1431 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_id] %s\n", command);
1432 my_system ( command);
1435 fp=vfopen (tmp_name, "r");
/* Reads one whitespace-delimited token. NOTE(review): the conversion has no width limit and its return value is not checked - confirm buf is large enough for any tool output. */
1436 fscanf ( fp, "\n%s\n", buf);
1439 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_id]DONE\n");
/* The token is copied into an exactly sized allocation. */
1441 id=vcalloc ( strlen (buf)+1, sizeof (char));
1442 sprintf ( id, "%s", buf);
/*
 * get_pdb_struc: resolve in_name to a PDB file through is_pdb_struc and,
 * when a range is requested, extract residues start..end of the first chain
 * into a new file.
 *
 *   in_name     structure name, optionally carrying a leading P.
 *   start, end  coordinates passed to extract_from_pdb -coor; when both are
 *               0 the file found by is_pdb_struc is returned as is.
 */
1450 char* get_pdb_struc(char *in_name, int start, int end)
1453 char command[LONG_STRING];
1459 name=vcalloc ( STRING, sizeof (char));
1460 sprintf ( name, "%s", in_name);
/* The name is tried as given, then without a leading P. NOTE(review): as written the failure branch is only taken when name starts with P; a failed lookup on any other name continues with name1 equal to NULL - confirm this is intended. */
1462 if ( (name1=is_pdb_struc(name))==NULL && (name[0]=='P' && ((name1=is_pdb_struc (name+1))==NULL)))
1464 fprintf ( stderr, "\nERROR Could not download structure %s [FATAL:%s]\n", name, PROGRAM);crash("");
1466 else if ( (start==0) && (end==0))return name1;
1469 declare_name(name2);
/* The output file name encodes the requested range. */
1470 sprintf ( name2, "%s_%d_%d.pdb", name, start, end);
/* The value returned by check_file_exists(name1) is printed with %s, and the output path is the cache directory immediately followed by name2 (no separator is added here). */
1471 sprintf ( command, "extract_from_pdb -infile \'%s\' -chain FIRST -coor %d %d > %s%s",check_file_exists(name1),start, end, get_cache_dir(),name2);
1472 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_struc] %s\n", command);
1473 my_system (command);
1475 if ( is_pdb_file(name2))return name2;
1478 fprintf ( stderr, "\nERROR Could not extract segment [%d %d] from structure %s [FATAL:%s]\n",start, end, name, PROGRAM);crash("");
1480 myexit (EXIT_FAILURE);
/*
 * seq_is_pdb_struc: return the template_file of the P template attached to
 * sequence i of S, or NULL when S, S->T[i] or its P member is missing.
 */
1486 char* seq_is_pdb_struc ( Sequence *S, int i)
1489 if (!S){return NULL;}
1490 else if ( !S->T[i]){return NULL;}
1491 else if ( !((S->T[i])->P)){return NULL;}
1492 else return ((S->T[i])->P)->template_file;
/*
 * is_pdb_struc: map name to the file that holds its PDB data, trying name,
 * then name.pdb, then a fetch through extract_from_pdb -netfile. Results,
 * including failures (stored as NULL), are cached per name in static tables.
 * A NULL or empty name yields NULL.
 */
1494 char* is_pdb_struc ( char *name)
1497 checks if this is the name of a local file that contains PDB data
1498 checks if this is the name of a file from a local db
1499 put the file in the cache
1500 checks if this is a file from a remote db (extract_from_pdb
1501 return NULL if everything fails
1504 static char *file_name1;
1505 static char *file_name2;
1507 static char **buf_names;
1508 static char **buf_result;
1516 if ( !name || name[0]=='\0')return NULL;
/* Cache tables hold 1000 entries; the two scratch file names hold 1000 chars each. */
1523 buf_names=vcalloc ( 1000, sizeof (char*));
1524 buf_result=vcalloc ( 1000, sizeof (char*));
1525 file_name1=vcalloc ( 1000, sizeof (char));
1526 file_name2=vcalloc ( 1000, sizeof (char));
/* Cache hit: return the stored answer, which may be NULL. */
1528 if ( (s=name_is_in_list ( name, buf_names,nbuf,-1))!=-1)return buf_result[s];
1532 sprintf ( file_name1, "%s", name);
1533 sprintf ( file_name2, "%s.pdb", name);
1536 if (is_pdb_file(file_name1)){r=file_name1;}
1537 else if (is_pdb_file(file_name2)){r=file_name2;}
/* Known PDB identifier: fetch it into the cache directory, then test file_name2 again. */
1538 else if (is_pdb_name (name))
1540 printf_system ("extract_from_pdb -netfile \'%s\' > %s/%s 2>/dev/null",name, get_cache_dir(), file_name2);
1541 if ( is_pdb_file(file_name2))r=file_name2;
/* Remember the outcome; both the name and the result are stored as private copies. NOTE(review): nbuf is not compared against the 1000-entry capacity in the statements visible here - confirm. */
1548 buf_names[nbuf]=vcalloc ( strlen (name)+1, sizeof (char));
1549 sprintf ( buf_names[nbuf], "%s", name);
1552 buf_result[nbuf]=vcalloc ( strlen (r)+1, sizeof (char));
1553 sprintf (buf_result[nbuf], "%s", r);
1555 else buf_result[nbuf]=NULL;
/*
 * fix_pdb_file: return a file name that passes is_pdb_file for the input in.
 *   - in is NULL or does not exist: a newly allocated empty string is returned.
 *   - in is already a PDB file: in itself is returned.
 *   - otherwise in is run through extract_from_pdb and the name of the
 *     converted copy is returned when the conversion produced a PDB file.
 */
1561 char *fix_pdb_file ( char *in)
/* NOTE(review): empty is allocated on every call, including the calls that do not return it - confirm whether it is released. */
1565 empty=vcalloc(1, sizeof(char));
1567 if ( !in || !check_file_exists (in))return empty;
1568 else if ( is_pdb_file(in))return in;
1571 char command[10000];
/* tmp2 is a private copy of tmp. NOTE(review): tmp is presumably a temporary file name - confirm. */
1575 tmp2=vcalloc (strlen (tmp)+1, sizeof (char));
1576 sprintf (tmp2, "%s", tmp);
1577 sprintf ( command, "extract_from_pdb %s > %s", check_file_exists(in), tmp2);
1578 my_system (command);
1579 if ( is_pdb_file (tmp))return tmp2;
/*
 * is_sap_file: test for SAP output by searching for the token "Percent"
 * twice; the second search is given the stream returned by the first.
 * A file that does not exist yields 0.
 */
1585 int is_sap_file ( char *name)
1589 if (!check_file_exists(name))return 0;
1591 if ((fp=find_token_in_file (name, NULL, "Percent"))!=NULL)
1593 if ((fp=find_token_in_file (name,fp, "Percent"))!=NULL)
/*
 * is_blast_file: test for BLAST output. Two signatures are probed: the
 * <SequenceSimilaritySearchResult> tag, and the tokens "Lambda" and
 * "Altschul," both present in the file. A file that does not exist yields 0.
 */
1610 int is_blast_file ( char *name)
1612 if ( !check_file_exists(name) ) return 0;
1613 else if (token_is_in_file (name, "<SequenceSimilaritySearchResult>"))
1619 if (token_is_in_file (name, "Lambda") && token_is_in_file (name, "Altschul,"))
/* is_simple_pdb_file: returns 1 when the token SIMPLE_PDB_FORMAT is found in the file; the stream returned by the search is closed first. */
1630 int is_simple_pdb_file ( char *name)
1633 if ((fp=find_token_in_file (name, NULL, "SIMPLE_PDB_FORMAT"))!=NULL){vfclose (fp);return 1;}
/*
 * is_pdb_file: test whether name looks like a PDB file by searching for the
 * record tokens HEADER, SEQRES and ATOM, each preceded by a newline.
 * Returns 1 when ispdb reaches at least 2, and 0 for a NULL or missing file.
 * NOTE(review): ispdb is presumably incremented once per token found -
 * confirm.
 */
1638 int is_pdb_file ( char *name)
1643 if ( name==NULL) return 0;
1644 if (!check_file_exists (name))return 0;
1646 if ((fp=find_token_in_file (name, NULL, "\nHEADER"))!=NULL)
1650 if ((fp=find_token_in_file (name, NULL, "\nSEQRES"))!=NULL)
1656 if ((fp=find_token_in_file (name, NULL, "\nATOM"))!=NULL)
1667 if ( ispdb>=2)return 1;
/*
 * is_seq: returns 1 when the format name reported by identify_seq_format
 * contains "seq". Returns 0 when the file does not exist or no format was
 * identified.
 */
1670 int is_seq ( char *name)
1674 if ( !check_file_exists(name))return 0;
1676 format= identify_seq_format(name);
/* The buffer handed back by identify_seq_format is released here on every path that reads it. */
1677 if(!format || format[0]=='\0'){vfree (format);return 0;}
1678 else if (strstr(format, "seq")){vfree (format);return 1;}
/*
 * is_aln: returns 1 when the format name reported by identify_seq_format
 * contains "aln". Returns 0 when the file does not exist or no format was
 * identified.
 */
1681 int is_aln ( char *name)
1684 if ( !check_file_exists (name))return 0;
1686 format= identify_seq_format(name);
/* The buffer handed back by identify_seq_format is released here on every path that reads it. */
1687 if ( !format || format[0]=='\0'){vfree (format);return 0;}
1688 else if (strstr(format, "aln")){vfree (format); return 1;}
/* is_matrix: returns 1 when read_matrice can load name; the matrix is only a probe and is freed at once. */
1692 int is_matrix (char *name)
1696 if ((m=read_matrice (name))!=NULL){free_int (m, -1); return 1;}
/*
 * is_newick: accepts a file whose very first character is an opening
 * parenthesis and that contains a semicolon further on. The stream is closed
 * before each return visible here.
 */
1699 int is_newick (char *name)
1705 fp=vfopen (name, "r");
/* No leading white space is skipped: the first byte itself must be the parenthesis. */
1706 if ( (c=fgetc(fp))!='('){vfclose (fp); return 0;}
1709 while ( (c=fgetc(fp))!=EOF)
1711 if ( c==';'){vfclose (fp); return 1;}
/* is_clustalw_matrix: returns 1 when the token CLUSTALW_MATRIX is found in the file; the stream returned by the search is closed first. */
1717 int is_clustalw_matrix ( char *name)
1723 if ( (fp=find_token_in_file (name, NULL, "CLUSTALW_MATRIX"))!=NULL){vfclose(fp);return 1;}
/* is_pavie_matrix: returns 1 when the token PAVIE_MATRIX is found in the file; the stream returned by the search is closed first. */
1726 int is_pavie_matrix ( char *name)
1732 if ( (fp=find_token_in_file (name, NULL, "PAVIE_MATRIX"))!=NULL){vfclose(fp);return 1;}
/* is_distance_matrix_file: returns 1 when the token TC_DISTANCE_MATRIX_FORMAT_01 is found in the file; the stream returned by the search is closed first. */
1735 int is_distance_matrix_file (char *name)
1738 if ( (fp=find_token_in_file (name, NULL, "TC_DISTANCE_MATRIX_FORMAT_01"))!=NULL){vfclose(fp);return 1;}
/* is_similarity_matrix_file: returns 1 when the token TC_SIMILARITY_MATRIX_FORMAT_01 is found in the file; the stream returned by the search is closed first. */
1741 int is_similarity_matrix_file (char *name)
1744 if ( (fp=find_token_in_file (name, NULL, "TC_SIMILARITY_MATRIX_FORMAT_01"))!=NULL){vfclose(fp);return 1;}
/* is_blast_matrix: returns 1 when the token BLAST_MATRIX is found in the file; the stream returned by the search is closed first. */
1747 int is_blast_matrix ( char *name)
1753 if ( (fp=find_token_in_file (name, NULL, "BLAST_MATRIX"))!=NULL){vfclose(fp);return 1;}
/* is_single_seq_weight_file: returns the result of token_is_in_file for the token SINGLE_SEQ_WEIGHT_FORMAT_01. */
1757 int is_single_seq_weight_file ( char *name)
1761 return token_is_in_file ( name, "SINGLE_SEQ_WEIGHT_FORMAT_01");
/*
 * is_stockholm_aln: test for the token STOCKHOLM with
 * find_token_in_file_nlines.
 * NOTE(review): the last argument (2) presumably limits the search to the
 * first lines of the file - confirm.
 */
1764 int is_stockholm_aln (char *file)
1768 if ((fp=find_token_in_file_nlines (file, NULL, "STOCKHOLM",2)))
/* is_lib: library test; currently a plain forward to is_lib_01. */
1776 int is_lib ( char *name)
1778 return is_lib_01(name);
/* is_lib_02: returns the result of token_is_in_file for the token TC_LIB_FORMAT_02. */
1781 int is_lib_02 ( char *name)
1784 return token_is_in_file ( name, "TC_LIB_FORMAT_02");
/*
 * is_lib_01: returns 1 when the file contains any of the three library
 * signatures TC_LIB_FORMAT_01, T-COFFEE_LIB_FORMAT_01 or SEQ_1_TO_N.
 */
1788 int is_lib_01 (char *name)
1792 if ( token_is_in_file ( name, "TC_LIB_FORMAT_01")) return 1;
1793 else if (token_is_in_file ( name, "T-COFFEE_LIB_FORMAT_01"))return 1;
1794 else if (token_is_in_file (name, "SEQ_1_TO_N"))return 1;
/* is_lib_list: returns 0 for a file that does not exist and 1 when the token TC_LIB_LIST_FORMAT_01 is found in it. */
1797 int is_lib_list ( char *name)
1799 if ( !check_file_exists (name))return 0;
1800 if ( token_is_in_file ( name, "TC_LIB_LIST_FORMAT_01")) return 1;
/*
 * is_method: test whether file names a method. It returns 1 when the file
 * carries the token TC_METHOD_FORMAT_01, and otherwise checks
 * is_in_pre_set_method_list.
 */
1803 int is_method ( char *file)
/* The tests run on a local copy of the name. */
1808 sprintf ( new_file, "%s", file);
1809 if ( (token_is_in_file(new_file, "TC_METHOD_FORMAT_01"))){return 1;}
1810 if ( is_in_pre_set_method_list(new_file))
/* NOTE(review): vremove is applied to new_file here; presumably is_in_pre_set_method_list leaves a file behind under that name - confirm. */
1813 vremove ( new_file);
1823 /*******************************************************************************************/
1826 /* SEQUENCE FORMAT IDENTIFIERS */
1828 /***************************************************************************************** */
/*
 * type_is_exon_boundaries: returns 1 as soon as a character of seq is one of
 * b, o, j (either case).
 * NOTE(review): a and b presumably iterate over the n sequences and their
 * characters - confirm.
 */
1829 int type_is_exon_boundaries(char **seq, int n)
1836 if ( strchr ("bojBOJ", seq[a][b]))return 1;
/*
 * format_is_oligo: opens file and sets r to 1 when buf equals "ALPHABET".
 * NOTE(review): buf is presumably the first token read from the file -
 * confirm.
 */
1841 int format_is_oligo(char *file)
1847 fp=vfopen ( file, "r");
1852 if ( strm (buf, "ALPHABET"))r=1;
/*
 * format_is_msf: returns 1 when the token MSF: is found by
 * find_token_in_file_nlines (called with 30), or when the first word of the
 * file is MSF:.
 */
1858 int format_is_msf ( char *file)
1865 if ( (fp=find_token_in_file_nlines (file,NULL,"MSF:", 30))!=NULL){vfclose (fp);return 1;}
/* Second chance: read the first whitespace-delimited word and compare it. NOTE(review): the conversion has no width limit and its return value is not checked - confirm buf is large enough. */
1871 fp=vfopen ( file, "r");
1872 fscanf (fp , "%s", buf);
1875 if ( strm (buf, "MSF:"))return 1;
/* format_is_fasta_aln: a FASTA file counts as an alignment exactly when format_is_fasta accepts it and format_is_fasta_seq rejects it. */
1880 int format_is_fasta_aln ( char *file)
1883 if ( format_is_fasta(file) && !format_is_fasta_seq(file))return 1;
/*
 * format_is_fasta_seq: decide whether a FASTA file holds unaligned
 * sequences. It returns 1 when the first sequence is missing or when the
 * sequences do not all have the same length. The parsed set is freed before
 * every return visible here.
 */
1888 int format_is_fasta_seq ( char *file)
1893 if ( format_is_fasta (file))
1895 S=get_fasta_sequence (file, NULL);
1897 else if ( !S->seq[0]){free_sequence (S, S->nseq); return 1;}
/* Sequences of unequal length cannot be an alignment. */
1898 l=strlen ( S->seq[0]);
1899 for ( a=0; a< S->nseq; a++)if(strlen(S->seq[a])!=l){free_sequence (S, S->nseq);return 1;}
/* Each sequence length is measured twice. NOTE(review): presumably gaps are removed between the two measurements and a difference marks the file as aligned - confirm. */
1900 for ( a=0; a< S->nseq; a++)
1902 l1=strlen ( S->seq[a]);
1904 l2=strlen ( S->seq[a]);
1907 free_sequence (S, S->nseq);
1911 free_sequence (S, S->nseq);
/*
 * format_is_fasta: returns 0 unless the file exists, its first non-white
 * character is '>', get_fasta_sequence can parse it, and format_is_pir does
 * not claim it.
 */
1920 int format_is_fasta ( char *file)
1924 if ( !check_file_exists(file))return 0;
1926 if ( get_first_non_white_char (file)!='>')return 0;
/* The parse is only a probe; the sequences are freed at once. */
1927 if ( !(S=get_fasta_sequence (file, NULL)))return 0;
1928 free_sequence (S, -1);
/* PIR files also start with '>', so they are excluded explicitly. */
1929 if ( format_is_pir(file)) return 0;
/* format_is_pir_aln: a PIR file counts as an alignment exactly when format_is_pir accepts it and format_is_pir_seq rejects it. */
1933 int format_is_pir_aln ( char *file)
1936 if ( format_is_pir(file) && !format_is_pir_seq(file))return 1;
/*
 * format_is_pir_seq: decide whether a PIR file holds unaligned sequences.
 * The file is parsed with get_pir_sequence and each sequence length is
 * measured twice.
 * NOTE(review): presumably gaps are removed between the two measurements and
 * the comparison of l1 and l2 drives the result - confirm.
 */
1940 int format_is_pir_seq ( char *file)
1946 if ( format_is_pir (file))
1948 S=get_pir_sequence (file, NULL);
1949 for ( a=0; a< S->nseq; a++)
1951 l1=strlen ( S->seq[a]);
1953 l2=strlen ( S->seq[a]);
1956 free_sequence (S, S->nseq);
/*
 * format_is_pir: returns 1 when both flags pir_name and star_end are still
 * set after every entry has been inspected. pir_name is cleared when a name
 * fails is_pir_name. The file is parsed with the FASTA reader.
 * NOTE(review): star_end is presumably cleared when a sequence is empty or
 * does not end with '*' - confirm.
 */
1969 int format_is_pir ( char *file)
1972 int pir_name=1, star_end=1, a;
1974 S=get_fasta_sequence (file, NULL);
/* NOTE(review): this early return does not free S in the visible code - confirm. */
1976 else if (!S->seq[0])return 0;
/* Both flags are set to 1 again here, repeating their initialisers. */
1978 pir_name=1; star_end=1;
1979 for (a=0; a< S->nseq; a++)
1982 if (!is_pir_name(S->name[a]))pir_name=0;
1983 l=strlen (S->seq[a]);
1984 if (!l || (l && S->seq[a][l-1]!='*'))
1987 free_sequence(S,-1);
1988 if ( pir_name && star_end) return 1;
/*
 * is_pir_name: returns 1 when name contains one of the two-letter PIR type
 * codes followed by a semicolon (P1, F1, DL, DC, RL, RC, XX). The match is a
 * substring search, so the code may appear anywhere in name.
 */
1991 int is_pir_name (char *name)
1993 if ( strstr (name, "P1;"))return 1;
1994 if ( strstr (name, "F1;"))return 1;
1995 if ( strstr (name, "DL;"))return 1;
1996 if ( strstr (name, "DC;"))return 1;
1997 if ( strstr (name, "RL;"))return 1;
1998 if ( strstr (name, "RC;"))return 1;
1999 if ( strstr (name, "XX;"))return 1;
/* format_is_conc_aln: returns 1 when the token CONC_MSF_FORMAT_01 is found in the file; the stream returned by the search is closed first. */
2004 int format_is_conc_aln (char *file)
2007 if ( (fp=find_token_in_file (file, NULL, "CONC_MSF_FORMAT_01"))){vfclose (fp); return 1;}
/*
 * format_is_saga: test for a Clustal/SAGA style interleaved alignment.
 * First a list of signature tokens is searched; any hit returns 1. When no
 * token is found, the block layout of the file is inspected instead.
 */
2010 int format_is_saga ( char *file)
/* Signature tokens, tried in order; the stream returned by a successful search is closed before returning. */
2018 if ( (fp=find_token_in_file (file, NULL, "SAGA"))){vfclose (fp); return 1;}
2019 else if ((fp=find_token_in_file (file, NULL, "CLUSTAL"))){vfclose (fp); return 1;}
2020 else if ((fp=find_token_in_file (file, NULL, "ClustalW"))){vfclose (fp); return 1;}
2021 else if ((fp=find_token_in_file (file, NULL, "clustalw"))){vfclose (fp); return 1;}
2022 else if ((fp=find_token_in_file (file, NULL, "clustal"))){vfclose (fp); return 1;}
2023 else if ((fp=find_token_in_file (file, NULL, "T-COFFEE_MSA"))){vfclose (fp); return 1;}
2024 else if ((fp=find_token_in_file (file, NULL, "INTERLEAVED_MSA"))){vfclose (fp); return 1;}
2029 else if ((fp=find_token_in_file (file, NULL, "T-COFFEE"))){vfclose (fp); return 1;}
2030 else if ((fp=find_token_in_file (file, NULL, "SAGA_FORMAT"))){vfclose (fp); return 1;}
2031 else if ((fp=find_token_in_file (file, NULL, "GARP"))){vfclose (fp); return 1;}
2032 else if ((fp=find_token_in_file (file, NULL, "INTERLEAVED"))){vfclose (fp); return 1;}
/* Structural fallback: the file must have more than two blocks. */
2036 list=get_file_block_pattern (file,&n_blocks,100);
2037 if (n_blocks<=2){free_int (list, -1);return 0;}
/* Blocks 1 to n_blocks-2 must each have list[a][0] equal to n_seq, and each of their entries must equal 2. NOTE(review): n_seq is presumably taken from one reference block and the entries are per-line field counts - confirm. */
2041 for ( a=1; a< n_blocks-1; a++)
2043 if ( list[a][0]!=n_seq){free_int (list, -1);return 0;}
2046 for ( b=1; b<=list[a][0]; b++)
2047 if ( list[a][b]!=2){free_int (list, -1);return 0;}
/*
 * format_is_swissprot: returns 1 when the file has an ID record token
 * (searched with find_token_in_file_nlines, called with 10) and an SQ record
 * token, both preceded by a newline. A file that does not exist yields 0.
 */
2058 int format_is_swissprot (char *name)
2062 if ( !check_file_exists(name))return 0;
/* NOTE(review): fp from the first search is overwritten by the second search without being closed - confirm whether that stream leaks. */
2067 if ( (fp=find_token_in_file_nlines (name,NULL,"\nID ",10))!=NULL\
2068 &&(fp=find_token_in_file (name,NULL,"\nSQ "))!=NULL )
2071 vfclose (fp);return 1;
2079 /*******************************************************************************************/
2084 /***************************************************************************************** */
/*
 * output_format_aln: write alignment inA, with the companion alignment inEA,
 * to file name in the requested format by delegating to main_output.
 * NOTE(review): the copies A and EA are presumably attached to D1 and D2
 * before the main_output call - confirm.
 */
2085 int output_format_aln ( char *format, Alignment *inA, Alignment *inEA,char *name)
2087 Sequence_data_struc *D1=NULL;
2088 Sequence_data_struc *D2=NULL;
/* Work on copies so the alignments passed in are left untouched. */
2093 A =copy_aln (inA, NULL);
2095 EA=copy_aln (inEA,NULL);
2097 EA=expand_number_aln(inA,EA);
/* Reorder each copy by its own expanded_order when one is set. */
2100 if (A && A->expanded_order )A=reorder_aln ( A, A->expanded_order,A->nseq);
2101 if (EA && EA->expanded_order)EA=reorder_aln ( EA, EA->expanded_order,EA->nseq);
2104 D1=vcalloc ( 1, sizeof (Sequence_data_struc));
2108 D2=vcalloc ( 1, sizeof (Sequence_data_struc));
/* D2 is passed in the third (DST) position of main_output; the second position is NULL. */
2112 main_output ( D1, NULL,D2, format, name);
// main_output: writes the data held in D1 (alignment D1->A, sequences D1->S,
// tree D1->T, matrix D1->M, weights D1->W, constraint list D1->CL) to out_file
// in the format named by out_format.
// The format name is matched by one long else-if chain; a name that matches
// nothing ends in the "UNKNOWN OUTPUT FORMAT" fatal error.
// Branches that print "You Need an evaluation File" go on to read DST->A.
// An empty out_format returns 0 before anything is written.
// NOTE(review): is_out_format_list() calls this with D1==NULL and out_file==NULL;
// only the rm_gap test below is visibly guarded against a NULL D1 -- confirm.
2120 int main_output (Sequence_data_struc *D1, Sequence_data_struc *D2, Sequence_data_struc *DST, char *out_format, char *out_file)
2128 if ( !out_format[0])return 0;
2129 if ( D1 && D1->rm_gap)ungap_aln ((D1->A));
// "expanded_" prefix: drop it from the format name, keep a copy of the
// alignment in BUF_A and replace D1->A with the thread_profile_files2aln result
2131 if ( (strstr (out_format, "expanded_")))
2134 out_format+=strlen ("expanded_");
2135 BUF_A=copy_aln (D1->A, NULL);
2136 (D1->A)=thread_profile_files2aln ((D1->A), NULL, NULL);
2140 if ( strm (out_format, ""))return 0;
// aln2lib: for every pair of sequences, write the aligned residue pairs and
// the score given by CL->evaluate_residue_pair
2141 else if ( ( strm (out_format, "aln2lib")))
2145 Constraint_list *CL;
2153 pos=aln2pos_simple(IN, IN->nseq);
2154 fp=vfopen (out_file, "w");
2155 fp=save_list_header (fp,CL);
2158 for ( b=0; b< IN->nseq-1; b++)
2160 for ( c=b+1; c< IN->nseq; c++)
2164 fprintf ( fp, "#%d %d\n", s1+1, s2+1);
2165 for ( a=0; a< IN->len_aln; a++)
2170 if ( s1==s2 && !CL->do_self)continue;
// the pair is always evaluated with the lower sequence index first
2172 if ( s1< s2)s=(CL->evaluate_residue_pair)( CL, s1, r1, s2, r2);
2173 else s=(CL->evaluate_residue_pair)( CL, s2, r2, s1, r1);
// UNDEFINED scores are written as 0
2175 s=(s!=UNDEFINED)?s:0;
2178 fprintf (fp, "\t%5d %5d %5d \n", r1, r2, s);
2183 vfclose (save_list_footer (fp, CL));
// score<...> and html: reliability output of D1->A coloured by the numbered DST->A
2185 else if ( strncmp (out_format, "score",5)==0 || strm (out_format, "html"))
2192 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format or use +evaluate][FATAL:%s]\n", PROGRAM);
2193 myexit(EXIT_FAILURE);
// unless the format is exactly "html", advance to the "_<type>" suffix
2195 if ( !strm ("html", out_format))while ( out_format[0]!='_' && out_format[0]!='\0' )out_format++;
2197 D1->S=aln2seq(D1->A);
2198 BUF=copy_aln (DST->A, NULL);
2199 DST->A=aln2number (DST->A);
2201 if ( strstr ( out_format, "html" ))output_reliability_html ( D1->A, DST->A, out_file);
2202 else if( strm ( out_format, "_ps" ))output_reliability_ps ( D1->A, DST->A, out_file);
2203 else if( strm ( out_format, "_pdf" ))output_reliability_pdf ( D1->A, DST->A, out_file);
2204 else if( strm ( out_format, "_ascii" ))output_reliability_ascii ( D1->A, DST->A, out_file);
2205 else if( strm ( out_format, "_seq" ))output_seq_reliability_ascii ( D1->A, DST->A, out_file);
// presumably the fallback when no suffix above matched: output DST itself,
// using the text after the '_' as the format name -- confirm
2209 main_output (DST, NULL, NULL, out_format+1, out_file);
// sec_html: recode each residue of a copy of the alignment from the _E_
// template string, then print A coloured by that copy
2212 else if (strm (out_format, "sec_html") || strm (out_format, "_E_html"))
2224 ST=copy_aln (A, NULL);
2225 for (a=0; a<ST->nseq; a++)
2227 i=name_is_in_list (ST->name[a],S->name, S->nseq, 100);
2230 buf=seq2E_template_string(S, i);
2231 if ( buf==NULL)continue;
2233 for (c=0,b=0; b<ST->len_aln; b++)
2236 r1=ST->seq_al[a][b];
2241 else if (s=='h')r1='9';
2242 else if (s=='c')r1='5';
2245 ST->seq_al[a][b]=r1;
// NOTE(review): the message says "tm_html" although this is the sec_html branch
2252 add_warning ( stderr, "Cannot output tm_html:_E_ template file (sec. struc.) is required for this output ", PROGRAM);
2254 output_color_html ( A, ST, out_file);
// tm_html: same scheme as sec_html, driven by the _T_ template string
2256 else if (strm (out_format, "tm_html") || strm (out_format, "_T_html"))
2268 ST=copy_aln (A, NULL);
2269 for (a=0; a<ST->nseq; a++)
2271 i=name_is_in_list (ST->name[a],S->name, S->nseq, 100);
2274 buf=seq2T_template_string(S, i);
2275 if ( buf==NULL)continue;
2277 for (c=0,b=0; b<ST->len_aln; b++)
2280 r1=ST->seq_al[a][b];
2285 else if (s=='h')r1='9';
2286 else if (s=='i')r1='5';
2289 ST->seq_al[a][b]=r1;
2296 add_warning ( stderr, "Cannot output tm_html:_T_ template file (trans. Memb. ) is required for this output ", PROGRAM);
2298 output_color_html ( A, ST, out_file);
// color_exoset: colour A from the matching sequence of EX ('j' -> '1', 'b' -> '2')
2301 else if (strm (out_format, "color_exoset"))
2303 Alignment *ST, *EX, *A;
2304 Constraint_list *CL;
2310 printf_exit ( EXIT_FAILURE, stderr, "\nYou must provide an obj file via the -struc_in flag [FATAL:%s]", PROGRAM);
2315 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt"));
2317 ST=copy_aln (A, NULL);
// NOTE(review): buf is sized with sizeof (int) but is filled with "%s" below
2318 buf=vcalloc ( EX->len_aln+1, sizeof (int));
2320 for ( a=0; a< A->nseq; a++)
2324 i=name_is_in_list (A->name[a],EX->name, EX->nseq, -1);
2325 if ( i==-1)continue;
2327 sprintf ( buf, "%s", EX->seq_al[i]);
2330 for (n=0,b=0; b<A->len_aln; b++)
2332 if (!is_gap(A->seq_al[a][b]))
2335 ST->seq_al[a][b]='0';
2336 else if ( buf[n]=='j')
2337 ST->seq_al[a][b]='1';
2338 else if ( buf[n]=='b')
2339 ST->seq_al[a][b]='2';
2346 output_color_html ( A, ST, out_file);
2347 return EXIT_SUCCESS;
// color_protogene: non-gap cells get colour value 2 while n<=3 and 9 afterwards
2350 else if (strm (out_format, "color_protogene"))
2353 DST->A=copy_aln (D1->A, NULL);
2354 for (n=1,a=0; a< (D1->A)->len_aln; a++, n++)
2356 for ( b=0; b<(D1->A)->nseq; b++)
// gap cells are left as copied (empty statement)
2358 if (is_gap((D1->A)->seq_al[b][a]));
2359 else if ( n<=3)(DST->A)->seq_al[b][a]=2;
2360 else if ( n>3)(DST->A)->seq_al[b][a]=9;
2365 output_color_html ( D1->A, DST->A, out_file);
2366 return EXIT_SUCCESS;
// color<...>_<type>: colour output of D1->A using DST->A
2369 else if ( strncmp (out_format, "color",5)==0)
2377 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format or use +evaluate][FATAL:%s]\n", PROGRAM);
2378 myexit(EXIT_FAILURE);
// advance to the "_<type>" suffix
2380 while ( out_format[0]!='_' && out_format[0]!='\0' )out_format++;
2382 BUF=copy_aln (DST->A, NULL);
2387 if ( strm ( out_format, "_html" ))output_color_html ( D1->A, DST->A, out_file);
2388 else if( strm ( out_format, "_ps" ))output_color_ps ( D1->A, DST->A, out_file);
2389 else if( strm ( out_format, "_pdf" ))output_color_pdf ( D1->A, DST->A, out_file);
2390 else if( strm ( out_format, "_ascii" ))output_color_ascii ( D1->A, DST->A, out_file);
2394 return main_output (DST, NULL, NULL, out_format+1, out_file);
2396 return EXIT_SUCCESS;
2398 else if ( strm4 ( out_format, "tc_aln","t_coffee_aln", "t_coffee", "tcoffee"))
2401 vfclose (output_aln ( D1->A, vfopen (out_file, "w")));
2403 else if ( strm ( out_format, "analyse_pdb"))
2408 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2409 myexit(EXIT_FAILURE);
2411 analyse_pdb ( D1->A,DST->A, "stdout");
2412 (DST->A)=aln2number (DST->A);
2413 output_reliability_ps ( D1->A, DST->A, out_file);
// lower0..lower9, align_pdb, malign_pdb: filter D1->A with filter_aln_upper_lower
2415 else if ( strm4 ( out_format, "lower0", "lower1", "lower2", "lower3") || strm4(out_format, "lower4", "lower5", "lower6", "lower7") || strm4 (out_format,"lower8", "lower9", "align_pdb", "malign_pdb") )
2420 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2421 myexit(EXIT_FAILURE);
2426 (DST->A)=aln2number (DST->A);
// threshold: 0 for align_pdb, 5 for malign_pdb, otherwise the digit after "lower"
2427 if ( strm (out_format, "align_pdb"))value=0;
2428 else if ( strm (out_format, "malign_pdb"))value=5;
2429 else value=atoi(out_format+5);
2431 D1->A=filter_aln_upper_lower (D1->A, DST->A,0, value);
2432 output_clustal_aln ( out_file, D1->A);
// repeat<N>: concatenate N consecutive sequences into one row of CONC,
// each followed by the letter "O"; the row takes the name of its first sequence
2434 else if ( strnm (out_format, "repeat", 6))
2441 size=atoi (out_format+6);
2443 CONC=declare_aln2 ( (D1->A)->nseq, ((D1->A)->len_aln+1)*size+1);
2445 for ( a=0; a< (D1->A)->nseq; a++)(D1->A)->seq_al[a][(D1->A)->len_aln]='\0';
2446 for ( c=0,a=0; a< (D1->A)->nseq;c++)
2449 sprintf ( CONC->name[c], "%s", (D1->A)->name[a]);
// NOTE(review): a advances here without a test against nseq; confirm nseq is
// always a multiple of size
2450 for ( b=0; b<size; b++, a++)
2452 strcat (CONC->seq_al[c], (D1->A)->seq_al[a]);
2453 strcat (CONC->seq_al[c], "O");
2456 CONC->nseq=c;CONC->len_aln=strlen (CONC->seq_al[0]);
2457 output_clustal_aln ( out_file, CONC);
// upper<N>: filter D1->A with filter_aln_lower_upper, N being the number after "upper"
2461 else if ( strnm (out_format, "upper", 5))
2467 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2468 myexit(EXIT_FAILURE);
2472 (DST->A)=aln2number (DST->A);
2474 value=atoi(out_format+5);
2476 D1->A=filter_aln_lower_upper (D1->A, DST->A,0, value);
2477 output_clustal_aln ( out_file, D1->A);
// filter0..filter3: the digit after "filter" is passed to filter_aln
2480 else if ( strm4 ( out_format, "filter0", "filter1", "filter2", "filter3"))
2485 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2486 myexit(EXIT_FAILURE);
2489 (DST->A)=aln2number (DST->A);
2491 D1->A=filter_aln (D1->A, DST->A, atoi(out_format+6));
2492 output_clustal_aln ( out_file, D1->A);
// plain alignment writers: each branch below hands D1->A to one output routine
2495 else if ( strm3 ( out_format, "phylip_aln", "phylip", "phy"))
2498 output_phylip_aln ( out_file, D1->A);
2500 else if ( strm ( out_format, "mocca_aln"))
2503 output_mocca_aln ( out_file, D1->A, DST->A);
2505 else if ( strm ( out_format, "saga_pw_sd_weights") )
2508 output_pw_weights4saga ((D1->W),(D1->W)->PW_SD, out_file);
2510 else if ( strm ( out_format, "saga_aln"))
2513 output_saga_aln (out_file, D1->A);
2515 else if (strm2 ( out_format, "aln","clustal_tc")|| strm (out_format, "msa"))
2519 output_clustal_aln (out_file, D1->A);
2521 else if (strm5 ( out_format, "strict_clustal","clustal_aln", "clustalw","clustal", "clustalw_aln") || strm (out_format,"number_aln"))
2524 output_strict_clustal_aln (out_file, D1->A);
2526 else if ( strm ( out_format, "conc_aln"))
2529 output_conc_aln (out_file, D1->A);
2531 else if ( strm2 ( out_format, "lalign_aln","lalign"))
2534 output_lalign (out_file, D1->A);
2536 else if ( strm2 ( out_format, "glalign_aln","glalign"))
2539 output_glalign (out_file, D1->A, DST->A);
2542 else if ( strm2 ( out_format, "fasta_aln","fasta" ) || strm (out_format, "blast_aln"))
2545 output_fasta_aln( out_file, D1->A);
// overaln: clean the alignment with aln2clean_pw_aln using the parameters
// collected in F, then write it in clustal format
2547 else if ( strstr (out_format, "overaln"))
2554 F=vcalloc (1, sizeof (OveralnP));
2556 string_array_upper ((D1->A)->seq_al, (D1->A)->nseq);
2559 D1->A=mark_exon_boundaries (D1->A, D2->A);
2562 else if ( (s=get_string_variable ("exon_boundaries")))
2566 EB=seq2aln(S=main_read_seq(s),NULL, 0);
2567 D1->A=mark_exon_boundaries (D1->A, EB);
2568 free_sequence (S, S->nseq); free_aln (EB);
// mode: taken from the format name if it contains lower/unalign2/unalign,
// otherwise from the "overaln_mode" variable, defaulting to "lower"
2571 if ( strstr (out_format, "lower")) sprintf (F->mode,"lower");
2572 else if (strstr (out_format, "unalign2"))sprintf (F->mode, "unalign2");
2573 else if (strstr (out_format, "unalign"))sprintf (F->mode, "unalign");
2574 else sprintf (F->mode, "%s", ((s=get_string_variable ("overaln_mode")))?s:"lower");
// NOTE(review): the value tested is F->mode but the message prints `mode` -- confirm
2575 if (!strm (F->mode, "lower") && !strm (F->mode, "unalign") && !strm (F->mode, "unalign2"))printf_exit (EXIT_FAILURE,stderr,"\nERROR: unknown overaln_mode in overaln output [%s] [FATAL:%s]", mode, PROGRAM);
// numeric parameters are only overridden when the matching variable is set
2577 if (int_variable_isset ("overaln_threshold"))F->t=get_int_variable ("overaln_threshold");
2578 if (int_variable_isset ("overaln_target"))F->f=get_int_variable ("overaln_target");
2579 if (int_variable_isset ("overaln_P1"))F->p1=get_int_variable ("overaln_P1");
2580 if (int_variable_isset ("overaln_P2"))F->p2=get_int_variable ("overaln_P2");
2581 if (int_variable_isset ("overaln_P3"))F->p3=get_int_variable ("overaln_P3");
2582 if (int_variable_isset ("overaln_P4"))F->p4=get_int_variable ("overaln_P4");
2584 if (eb)sprintf (F->model, "fsa2");
2585 else sprintf (F->model, "fsa1");
2586 D1->A=aln2clean_pw_aln (D1->A, F);
2588 //if (eb)D1->A=aln2clean_pw_aln (D1->A, mode,t, f,p1,p2,p3, "fsa2");
2589 //else D1->A=aln2clean_pw_aln (D1->A, mode,t, f,p1,p2,p3, "fsa1");
2591 D1->S=aln2seq(D1->A);
2592 output_clustal_aln (out_file, D1->A);
2594 else if ( strm ( out_format, "est_prf" ))
2597 output_est_prf( out_file, D1->A);
2599 else if ( strm ( out_format, "clean_est_fasta_seq" ))
2602 D1->A=clean_est(D1->A);
2603 output_fasta_seq(out_file, D1->A);
2607 else if ( strm3 ( out_format, "msf_aln", "gcg", "msf"))
2610 output_msf_aln( out_file, D1->A);
2612 else if ( strm ( out_format, "rnalign"))
2615 output_rnalign (out_file, D1->A, DST->S);
// tblastx_db, tblastx_db1..3: the last argument of seq2tblastx_db is 1, 2 or 3
// (plain "tblastx_db" uses 3)
2617 else if ( strm ( out_format, "tblastx_db1"))
2619 seq2tblastx_db (out_file,D1->S,1);
2621 else if ( strm ( out_format, "tblastx_db") || strm (out_format, "tblastx_db3"))
2623 seq2tblastx_db (out_file,D1->S,3);
2625 else if ( strm ( out_format, "tblastx_db2"))
2627 seq2tblastx_db (out_file,D1->S,2);
2629 else if ( strm ( out_format, "fasta_seq") ||strm ( out_format, "list")||strm ( out_format, "file_list"))
2633 output_fasta_seq (out_file,D1->A);
2635 else if (strm (out_format, "fasta_tree") )
2638 output_fasta_tree (out_file,D1->A);
2641 else if ( strm ( out_format, "gotoh_seq"))
2644 output_gotoh_seq (out_file,D1->A);
2646 else if ( strm (out_format, "fasta_seq1"))
2649 output_fasta_seq1 (out_file, D1->A);
2651 else if ( strm2 (out_format, "pir_aln", "pir"))
2654 output_pir_aln (out_file, D1->A);
2656 else if ( strm (out_format, "pir_seq"))
2659 output_pir_seq (out_file, D1->A);
2661 else if ( strm (out_format, "gor_seq"))
2664 output_gor_seq (out_file, D1->A);
2666 else if ( strm (out_format, "pir_seq1"))
2669 output_pir_seq1 (out_file, D1->A);
2671 else if ( strm (out_format, "pw_lib_saga_aln"))
2674 output_pw_lib_saga_aln (out_file, D1->A);
2676 else if ( strm (out_format, "lib"))
2679 output_lib (out_file, D1->A);
2681 else if ( strm (out_format, "pdb_constraint_list"))
2684 output_constraints (out_file, "pdb",D1->A);
// constraint_list / tc_lib: without a constraint list in D1 a "sim" list is
// generated from the alignment, otherwise D1->CL is saved in ascii
2686 else if ( strm2 (out_format, "constraint_list","tc_lib"))
2690 else if (!D1->CL)output_constraints (out_file,"sim", D1->A);
2691 else if (D1->CL) vfclose ( save_constraint_list ( D1->CL, 0, (D1->CL)->ne, out_file, NULL, "ascii",(D1->CL)->S));
2693 else if ( strm2 (out_format, "extended_lib","extended_cosmetic"))
2696 output_constraints (out_file,out_format, D1->A);
2698 else if ( strncmp (out_format, "extended_pair", 13)==0)
2701 output_constraints (out_file,out_format, D1->A);
2703 else if ( strm (out_format, "cache_id"))
2707 output_saga_aln (out_file, D1->A);
2709 else if ( strm (out_format, "compress_aln"))
2712 compress_aln (D1->A);
2713 output_saga_aln (out_file, D1->A);
// n_seq / nseq: write only the number of sequences
2715 else if (strm (out_format, "n_seq") ||strm (out_format, "nseq") )
2718 fp=vfopen ( out_file, "w");
2719 fprintf ( fp, "%d\n", (D1->A)->nseq);
2723 else if ( strm ( out_format, "thread_dna_on_prot_aln"))
2726 D1->A=thread_dnaseq_on_prot_aln (D1->S, D2->A);
2727 output_saga_aln ( out_file, D1->A);
2729 else if ( strm ( out_format, "tdna_fasta_seq1"))
2731 D1->A=translate_dna_aln (D1->A,0);
2732 output_fasta_seq1 (out_file, D1->A);
2734 else if (strm (out_format, "exons"))
2737 //exons come in upper case
2738 //output alternates them upper/lower
2740 A=copy_aln (D1->A, NULL);
2741 A->seq_al=gene2exons(A->seq_al,A->nseq);
2742 output_fasta_seq (out_file,A);
2745 else if ( strm (out_format, "wexons"))
2748 output_wexons (out_file,D1->A);
2751 else if ( strm (out_format, "texons"))
2755 //exons come in upper case
2756 //output alternate amino acids in upper/lower case
2757 //amino acid has the case of its first nucleotide
2759 A=copy_aln (D1->A, NULL);
2760 A->seq_al=gene2exons(A->seq_al,A->nseq);
2762 output_fasta_seqS (out_file,S=translate_dna_seqS(S,1,'X'));
2764 else if ( strm (out_format, "sexons"))
2768 //exons come in upper case
2769 //output alternate amino acids in upper/lower case
2770 //amino acid has the case of its first nucleotide
// NOTE(review): A is copied here but D1->A is what gets written -- confirm intent
2772 A=copy_aln (D1->A, NULL);
2773 output_fasta_seq ( out_file, D1->A);
2776 else if ( strm ( out_format, "tdna_aln"))
2778 D1->A=translate_dna_aln (D1->A,0);
2779 output_saga_aln ( out_file, D1->A);
2781 else if ( strm ( out_format, "cdna_fasta_seq1"))
2783 D1->A= gene2prot(D1->A);
2784 output_fasta_seq1 ( out_file, D1->A);
2786 else if ( strm ( out_format, "mutate_cdna_aln"))
2788 D1->A= mutate_cdna_aln ( D1->A);
2789 output_clustal_aln ( out_file, D1->A);
2791 else if ( strm ( out_format, "tdna_sp_aln"))
2795 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
2796 myexit(EXIT_FAILURE);
2798 (DST->A)=aln2number (DST->A);
2799 D1->A=translate_splice_dna_aln (D1->A, DST->A);
2800 output_saga_aln ( out_file, D1->A);
2802 else if (out_format && out_format[0] && (strcmp ( out_format,"rna_graph_fasta")==0))
// NOTE(review): the copy goes into D1->A but DST->A is the alignment written -- confirm
2805 sprintf ( (D1->A)->seq_al[0], "%s",(DST->S)->seq[0]);
2807 output_fasta_seq (out_file, DST->A);
2809 else if (strm ( out_format, "freq_mat"))
2812 output_freq_mat (out_file, D1->A);
2814 else if (strm ( out_format, "maln_pval"))
2816 output_maln_pval ( out_file, D1->A);
2818 else if ( strm ( out_format, "model_aln"))
2821 output_model_aln ( out_file, D1->A);
// mult<N>: the number after "mult" is passed to output_mult_fasta_seq
2823 else if (strncmp (out_format, "mult",4)==0)
2826 output_mult_fasta_seq ( out_file, D1->A, atoi(out_format+4));
2828 else if (strm (out_format, "conservation"))
2830 output_conservation_statistics (out_file, D1->A);
2832 else if (strm (out_format, "len"))
2835 output_statistics (out_file, D1->A, "nrl");
// name: names come from the alignment if there is one, otherwise from the tree
2837 else if ( strm (out_format, "name"))
2840 if ( D1->A)output_statistics (out_file, D1->A, "n");
// NOTE(review): the FILE* returned by vfopen is not kept here; confirm that
// print_array_char closes it
2844 TS=tree2seq(D1->T, NULL);print_array_char (vfopen(out_file, "w"), TS->name, TS->nseq, "\n");
// code_name: one "<name> C<index>" line per sequence, index starting at 1;
// when both are set, the tree names override the alignment names
2847 else if ( strm (out_format, "code_name"))
2853 if ( D1->A){n=(D1->A)->nseq, nl=(D1->A)->name;}
2854 if ( D1->T){TS=tree2seq(D1->T, NULL);nl=TS->name;n=TS->nseq;}
2856 lfp=vfopen (out_file, "w");
2857 for ( num=0; num<n; num++)
2858 fprintf (lfp, "\n%s C%d", nl[num], num+1);
2859 fprintf (lfp, "\n");
2862 else if ( strm ( out_format, "seq2struc"))
2864 output_seq2struc (out_file, D1->A);
// NOTE(review): strstr matches the name anywhere in out_format, but the numeric
// argument is read at a fixed offset from the start of out_format (here and in
// the age_matrix branch)
2866 else if ( strstr ( out_format, "pavie_age_channel"))
2868 output_n_pavie_age_channel ( D1->S,out_file, atoi((out_format+strlen ("pavie_age_channel"))));
2869 return EXIT_SUCCESS;
2871 else if ( strstr ( out_format, "age_matrix"))
2873 output_age_matrix (out_file, atoi((out_format+10)));
2875 else if ( strm ( out_format, "transitions"))
2877 output_transitions (out_file, D1->A);
// statistics<flags>: the text after "statistics" is passed on as the mode string
2880 else if ( strncmp (out_format, "statistics",10)==0)
2884 output_statistics (out_file, D1->A,out_format+10);
2890 else if ( strm4 (out_format, "newick_tree","newick","binary","nh"))
2894 /*D1->T=unroot_tree(D1->T);*/
2895 vfclose (print_tree ((D1->T), out_format, vfopen ( out_file, "w")));
// sarsim<N>: compare D1->S against D2->S, or against itself when D2 has no sequences
2897 else if ( strncmp (out_format, "sarsim", 6)==0)
2900 compare_sar_sequence (D1->S, (D2 &&D2->S)?D2->S:D1->S, atoi(out_format+6));
2901 return EXIT_SUCCESS;
2903 else if ( strncmp (out_format, "sim",3)==0)
2906 output_similarities (out_file, D1->A,out_format);
2909 else if ( strncmp (out_format, "cov",3)==0)
2912 output_similarities (out_file, D1->A,out_format);
2914 else if ( strm (out_format, "stockholm_aln"))
2916 output_stockholm_aln (out_file,D1->A, (D2)?D2->A:NULL);
2918 else if ( strm (out_format, "pair_sim"))
2922 fprintf ( stderr, "\n-output=pair_sim: provide aln1 via -in and aln2 via -in2 [FATAL:%s]\n", PROGRAM);
2923 myexit (EXIT_FAILURE);
2925 output_similarities_pw (out_file, D1->A,D2->A,out_format);
2927 else if ( strm (out_format, "matrix") || strm (out_format, "blast_matrix"))
2929 output_blast_mat (D1->M, out_file);
2931 else if ( strm (out_format, "header_matrix"))
2933 output_header_mat(D1->M, out_file);
// no branch matched: fatal error
2939 fprintf ( stderr, "\n%s is an UNKNOWN OUTPUT FORMAT [FATAL:%s]\n",out_format, PROGRAM);
2940 myexit (EXIT_FAILURE);
2944 //Remove the expansion
// is_in_format_list: returns 1 when name is exactly (strcmp) one of the input
// format names listed below.
// NOTE(review): the value returned for an unlisted name is presumably 0 -- confirm.
2952 int is_in_format_list ( char *name)
2954 if ( strcmp ( name, "saga_aln")==0)return 1;
2955 if ( strcmp ( name, "number_aln")==0)return 1;
2956 if ( strcmp ( name, "clustal_aln")==0)return 1;
2957 if ( strcmp ( name, "fasta_aln")==0)return 1;
2958 if ( strcmp ( name, "number_fasta")==0)return 1;
2959 if ( strcmp ( name, "fasta_seq")==0)return 1;
2960 if ( strcmp ( name, "pdb")==0)return 1;
2961 if ( strcmp ( name, "msf_aln")==0)return 1;
2962 if ( strcmp ( name, "dali_aln")==0)return 1;
2963 if ( strcmp ( name, "dali_seq")==0)return 1;
2964 if ( strcmp ( name, "barton_list_tc")==0)return 1;
2965 if ( strcmp ( name, "est_prf")==0)return 1;
2967 if ( strcmp ( name, "gotoh_aln")==0)return 1;
2968 if ( strcmp ( name, "amps_aln")==0)return 1;
2969 if ( strcmp ( name, "pir_aln")==0)return 1;
2970 if ( strcmp ( name, "pir_seq")==0)return 1;
2971 if ( strcmp ( name, "est_fasta")==0)return 1;
2972 if ( strcmp ( name, "amps_sd_scores")==0)return 1;
2973 if ( strcmp ( name, "pima_aln")==0)return 1;
2974 if ( strcmp ( name, "dialign_aln")==0)return 1;
2975 if ( strcmp ( name, "gor_seq")==0)return 1;
2976 if ( strcmp ( name, "gor_struc")==0)return 1;
2977 if ( strcmp ( name, "stockholm_aln")==0)return 1;
// is_struc_in_format_list: returns 1 when name is exactly "rna_number" or
// "fasta_seq", the two structure input formats tested here.
// NOTE(review): the value returned for any other name is presumably 0 -- confirm.
2981 int is_struc_in_format_list ( char *name)
2983 if ( strcmp ( name, "rna_number")==0)return 1;
2984 if ( strcmp ( name, "fasta_seq")==0)return 1;
// format_name2aln_format_name: rewrites name in place, turning the short names
// "gcg" and "fasta" into "msf" and "fasta_aln".
// The caller's buffer must have room for "fasta_aln" (9 characters plus the
// terminator), which is longer than the "fasta" it replaces.
2987 char *format_name2aln_format_name (char *name)
2989 if ( strm (name, "gcg"))sprintf (name, "msf");
2990 else if ( strm (name, "fasta"))sprintf (name, "fasta_aln");
// is_out_format_list: probes main_output with no data and no file name,
// passing only the format name, and returns whatever main_output returns.
2993 int is_out_format_list ( char *name)
2995 return main_output (NULL, NULL, NULL, name, NULL);
// is_struc_out_format_list: same probe as is_out_format_list -- calls
// main_output with NULL data and a NULL file name and returns its result.
2998 int is_struc_out_format_list ( char *name)
3000 return main_output (NULL, NULL, NULL, name, NULL);
3003 /**************************************************************************************************/
3004 /*************************************REFORMAT UTIL*************************************************/
3005 /**************************************************************************************************/
3007 /*************************************REFORMAT IN**************************************************/
3008 /**************************************************************************************************/
3009 /*******************************************************************************************/
3014 /***************************************************************************************** */
3016 /*******************************************************************************************/
3021 /***************************************************************************************** */
// get_amps_sd_scores: reads pairwise scores from the AMPS output file fname
// into a Weights structure created with declare_weights.
// The file is opened twice, each time positioned with set_fp_id on "Index":
// the first pass runs before declare_weights (nseq), the second fills
// W->seq_name, W->PW_ID and W->PW_SD.
// NOTE(review): nseq is presumably counted during the first pass -- confirm.
3023 Weights* get_amps_sd_scores ( char *fname)
3034 buf=vcalloc ( 1001, sizeof (char));
3035 buf2=vcalloc ( 1001, sizeof (char));
3037 fp=vfopen ( fname, "r");
3038 set_fp_id ( fp, "Index");
// NOTE(review): fgets returns NULL at end of file, which would replace buf here
3039 buf=fgets ( buf, 1000, fp);
// NOTE(review): %s has no field width; buf2 holds 1001 bytes
3040 fscanf ( fp, "%s", buf2);
// isalnum && !isalpha: true only while the token starts with a digit
3043 while ( isalnum(buf2[0]) && !isalpha(buf2[0]))
3046 buf=fgets ( buf, 1000, fp);
3047 fscanf ( fp, "%s", buf2);
3051 W=declare_weights (nseq);
// second pass: same positioning, this time reading the name that follows each '>'
3053 fp=vfopen ( fname, "r");
3054 set_fp_id ( fp, "Index");
3055 buf=fgets ( buf, 1000, fp);
3056 fscanf ( fp, "%s", buf2);
3059 while ( isalnum(buf2[0]) && !isalpha(buf2[0]))
3061 fp=set_fp_after_char (fp, '>');
3062 fscanf ( fp, "%s",W->seq_name[a]);
3063 buf=fgets ( buf, 1000, fp);
3064 fscanf ( fp, "%s", buf2);
3067 buf=fgets ( buf, 1000, fp);
// each pairwise record is read as 16 floats
3071 for ( e=0; e< 16; e++)
3073 c=fscanf ( fp, "%f", &array[e]);
// field 9 goes to PW_ID and field 14 to PW_SD, stored symmetrically
3081 W->PW_ID[b][a]=W->PW_ID[a][b]=array[9];
3082 W->PW_SD[b][a]=W->PW_SD[a][b]=array[14];
3087 sprintf ( W->comments, "SD WEIGHTS GENERATED WITH THE PROGRAM AMPS IN PAIRWISE MODE");
// read_seq_weight: builds a Weights structure for the nseq sequences named in
// name and reads per-sequence weights from the file seq_weight.
// The file must pass is_single_seq_weight_file, otherwise the program exits.
// W->seq_name is filled from name and W->mode receives the file name.
3092 Weights *read_seq_weight (char **name, int nseq, char* seq_weight)
3099 char line[LONG_STRING];
3100 char sname[MAXNAMES];
3103 /*Read sequence weights:
3110 weights must be between 0 and 1;
3112 sequences not in S do not get any weight
3113 sequences in S but not in file get a weight of 1
3115 if ( !is_single_seq_weight_file (seq_weight))
// NOTE(review): "FATA" in the message below looks like a typo for "FATAL"
3117 fprintf ( stderr, "\nERROR: File %s is not in Format SINGLE_SEQ_WEIGHT_FORMAT_01 [FATA:%s]", seq_weight,PROGRAM);
3118 myexit (EXIT_FAILURE);
3123 W=declare_weights(nseq);
3124 for ( a=0; a< nseq; a++)
3126 sprintf ( W->seq_name[a], "%s", name[a]);
3129 sprintf ( W->mode, "%s", seq_weight);
3130 fp=vfopen (seq_weight, "r");
3133 while ( fgets( line,LONG_STRING-1, fp))
// lines starting with '*' or '#' and blank lines match here; the body is empty
3135 if ( line[0]=='*' ||line[0]=='#' || isblanc(line));
// a usable line is "<name> <weight>"; anything else is skipped
// NOTE(review): %s into sname has no field width (sname is MAXNAMES bytes)
3138 if (sscanf(line, "%s %f", sname, &w)!=2)continue;
// only names present in W->seq_name are considered
3139 if ( (p=name_is_in_list ( sname, W->seq_name, nseq, MAXNAMES-1))!=-1)
3151 /*******************************************************************************************/
3156 /***************************************************************************************** */
// read_rename_file: reads whitespace-separated name pairs from fname into a
// table with one row per line of the file (plus one), two columns and
// MAXNAMES+1 characters per entry.
// With code==CODE the first word of a line goes to column 0 and the second to
// column 1; with code==DECODE the two columns are swapped.
// NOTE(review): the %s conversions have no field width; entries hold MAXNAMES+1 bytes.
3158 char *** read_rename_file ( char *fname, int code)
3162 char ***convert=NULL;
3164 convert=declare_arrayN(3, sizeof (char),count_n_line_in_file(fname) +1,2,MAXNAMES+1);
3165 fp=vfopen (fname, "r");
// reading stops at the first line that does not yield two words
3167 if ( code==CODE) while ( fscanf ( fp, "%s %s\n", convert[n][0], convert[n][1])==2)n++;
3168 else if (code==DECODE)while ( fscanf ( fp, "%s %s\n", convert[n][1], convert[n][0])==2)n++;
// get_barton_list_tc_seq: splits the '#'-delimited entries of in_file into
// per-entry files through process_barton_entry and writes several index files
// in the current directory:
//   barton_nseq0 .. barton_nseq8  names of entries with that many sequences
//   barton_seq_list_large         names of entries with more than 8 sequences
//   make_dir                      mkdir/mv commands, one group per entry
//   barton_length                 one "index: value" line per entry of length[]
3173 void get_barton_list_tc_seq ( char *in_file)
3175 FILE *fp, *fp_make, *fp_length, *fp_long;
3189 length=vcalloc ( 1000, sizeof(int));
3190 if ( buf==NULL)buf=vcalloc ( len_buf, sizeof (char));
3191 fp=vfopen (in_file, "r");
3192 fp_long=vfopen ( "barton_seq_list_large", "w");
3193 fp_make=vfopen ( "make_dir", "w");
3194 fp_length=vfopen ( "barton_length", "w");
3195 for ( a=0; a< 9; a++)
3197 sprintf ( name, "barton_nseq%d",a);
3198 fp_small[a]=vfopen ( name, "w");
// NOTE(review): the fgetc loops below that look for '#' do not test for EOF
3205 while ( (c=fgetc(fp))!='#');
3206 while ( (c=fgetc(fp))=='#');
// the text up to the next '#' is the entry name
3208 while ( (c=fgetc(fp))!='#')buf[a++]=c;
3211 sprintf ( name, "%s", buf);
3213 while ( (c=fgetc(fp))=='#');
// the entry body runs up to the next '#' or the end of file; buf grows as needed
3218 while ( (c=fgetc(fp))!='#' && c!=EOF)
3224 buf=vrealloc ( buf, len_buf*sizeof (char));
3231 nseq=process_barton_entry ( buf,name);
3233 longest=(longest<nseq)?nseq:longest;
3235 if ( nseq<=8) fprintf ( fp_small[nseq], "%s.pep\n", name);
3236 else fprintf ( fp_long, "%s.pep\n",name);
3237 fprintf ( fp_make, "mkdir %s\nmv %s.pep %s\nmv %s.check %s\n", name, name, name, name, name);
3247 for ( a=0; a< 9; a++)vfclose (fp_small[a]);
// NOTE(review): length[] holds 1000 ints; confirm longest stays below that
3249 for ( a=0; a<= longest; a++)fprintf ( fp_length, "%d: %d\n", a, length[a]);
3250 vfclose ( fp_length);
// process_barton_entry: parses the text of one entry held in buf, where each
// sequence has a ">P1;<name>" line, a comment line and residues ended by '*'.
// Writes the sequences to "<name>.pep" with output_fasta_seq and the comments
// to "<name>.check" with output_pir_check.
// The buffer is scanned twice: once to find the shortest and longest sequence
// (used to size the Sequence structure), once to fill it.
// NOTE(review): the return value is presumably the number of sequences, as the
// caller stores it in nseq -- confirm.
3254 int process_barton_entry (char *buf, char *name)
3264 int min_len_seq=999999;
3271 sprintf ( fname, "%s.pep", name);
3272 sprintf ( com_name, "%s.check",name);
3274 if ( buf2==NULL)buf2=vcalloc ( 10000, sizeof (char));
// first scan: measure each sequence
3276 while (buf[a]!='\0')
3280 a=get_string_line (a,2, buf, buf2);
// NOTE(review): the '*' searches in this function do not test for the end of buf
3281 while ((c=buf[a++])!='*')
3282 if (isalnum (c)|| c=='.' || c=='-')
3284 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3285 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3289 if ( buf[a]!='\0')a++;
3293 LS=declare_sequence ( min_len_seq, max_len_seq, nseq);
// second scan: name line, comment line, then residues up to '*'
3297 for (a=0, current=0; current< nseq; current++)
3299 a=get_string_line ( a, 1, buf, buf2);
3300 sscanf ( buf2, ">P1;%s", LS->name[current]);
3301 a=get_string_line ( a, 1, buf, buf2);
3304 sprintf ( LS->seq_comment[current],"%s", buf2);
3307 while ( (c=buf[a++])!='*')
3310 LS->seq[current][p++]=tolower (c);
3311 else if ( isgraph(c))
3312 LS->seq[current][p++]=(c);
3317 LA=declare_Alignment(LS);
3318 seq2aln ( LS, LA,rm_gap);
3319 output_fasta_seq (fname,LA);
3320 output_pir_check (com_name,LA->nseq, LA->seq_comment);
3321 free_Alignment ( LA);
3322 free_sequence ( LS, nseq);
// read_rna_struc_number: reads a numbered RNA structure description from fname.
// The file starts with a character and the number of records (ST->tot_list),
// followed by one record per line: three integers, two characters and a float.
// The three integers of record a are stored in ST->list[a][0..2].
// Consecutive records sharing the same first integer form one stem:
// ST->stem[k][0] is that integer, [1] the first record index, [2] the last.
// A one-sequence structure string SA is marked with '-' at the two positions of
// every record and with '>' / '<' at the positions of the first and last record
// of each stem.
3330 Structure *read_rna_struc_number (Alignment *A,char *fname)
3340 SA=declare_sequence ( A->len_aln, A->len_aln, 1);
3341 SA->len[0]=A->len[0];
3342 for ( a=0; a< SA->len[0]; a++)
3344 ST=declare_rna_structure_num (SA);
3347 fp=vfopen ( fname, "r");
3348 fscanf ( fp, "%c\n%d\n",&x, &(ST)->tot_list);
// NOTE(review): ST->list has SA->len[0] rows; confirm tot_list cannot exceed it
3349 for ( a=0; a<(ST)->tot_list; a++)
3351 fscanf ( fp, "%d %d %d %c %c %f\n", &(ST)->list[a][0],&(ST)->list[a][1],&(ST)->list[a][2], &x, &y, &f);
3357 (ST)->stem[0][0]=(ST)->list[a][0];
// same stem id as the current stem: nothing to do (empty statement)
3360 else if ( (ST)->stem[(ST)->tot_stem][0]==(ST)->list[a][0]);
// new stem id: close the current stem at the previous record and open the next
3361 else if ( (ST)->stem[(ST)->tot_stem][0]!=(ST)->list[a][0])
3363 (ST)->stem[(ST)->tot_stem][2]=a-1;
3365 (ST)->stem[(ST)->tot_stem][0]=(ST)->list[a][0];
3366 (ST)->stem[(ST)->tot_stem][1]=a;
3369 SA->seq[0][(ST)->list[a][1]]='-';
3370 SA->seq[0][(ST)->list[a][2]]='-';
// close the last stem
3372 (ST)->stem[(ST)->tot_stem][2]=a-1;
3374 for ( a=0; a< (ST)->tot_stem; a++)
3377 first=(ST)->stem[a][1];
3378 last=(ST)->stem[a][2];
3379 SA->seq[0][(ST)->list[first][1]]='>';
3380 SA->seq[0][(ST)->list[first][2]]='<';
3381 SA->seq[0][(ST)->list[last][1]]='>';
3382 SA->seq[0][(ST)->list[last][2]]='<';
// declare_rna_structure_num: allocates a zeroed Structure whose list and stem
// tables each have SA->len[0] rows of 3 ints.
3388 Structure * declare_rna_structure_num (Sequence *SA)
3391 ST=vcalloc ( 1, sizeof ( Structure));
3392 ST->list=declare_int ( SA->len[0], 3);
3393 ST->stem=declare_int ( SA->len[0], 3);
// read_lib_list: returns the lines of the file name that do not contain the
// "TC_LIB_LIST_FORMAT_01" tag; the number of lines kept is stored in n[0].
// The returned entries point into the array produced by file2lines (they are
// not copied). Scanning starts at index 1 of that array.
3396 char ** read_lib_list (char *name, int *n)
3403 lines=file2lines (name);
3406 list=vcalloc (l, sizeof (char*));
3407 for ( n[0]=0,a=1; a<l; a++,b++)
3408 if ( !strstr (lines[a], "TC_LIB_LIST_FORMAT_01"))list[n[0]++]=lines[a];
3413 /*******************************************************************************************/
3418 /***************************************************************************************** */
// read_group: reads a group file and returns one string list per line that
// starts with '>' (see the format description below).
3419 char ***read_group ( char *file)
3421 /*Format: Fasta like, the name of the group followed with the name of the sequences
3422 ><Group name> <First Seq> <second seq> ....
3423 Groups must NOT be overlapping
3424 list[group_index][0]="number of sequences"
3425 list[group_index][1]="group name"
3426 list[group_index][2...N]="sequence"
// the line buffer is sized from the longest line, the list from the line count
3436 l=measure_longest_line_in_file (file)+1;
3437 buf=vcalloc (l, sizeof (char));
3438 list=vcalloc ( count_n_line_in_file (file )+1, sizeof (char**));
3440 fp=vfopen (file, "r");
// the first character of each line is consumed by fgetc, the rest by fgets
3443 while ((c=fgetc(fp))!=EOF)
3445 buf=fgets (buf,l-1, fp);
// only lines whose first character is '>' are kept; buf no longer holds the '>'
3446 if ( c=='>')list[a++]=string2list (buf);
3454 static Sequence* get_pdb_sequence_from_field (char *fname, char *field);
// get_pdb_sequence: extracts the sequence of the PDB file fname, trying the
// SEQRES field first and falling back to the ATOM field.
// A warning is recorded when the ATOM field had to be used, and another when
// neither field yields a sequence.
3456 Sequence* get_pdb_sequence (char *fname)
// SEQRES worked: nothing more to do (empty statement)
3460 if ( (S=get_pdb_sequence_from_field(fname, "SEQRES"))!=NULL);
3461 else if ( (S=get_pdb_sequence_from_field(fname, "ATOM"))!=NULL)
3463 add_warning (stderr,"Warning: Read Sequence from ATOM field in %s [%s:WARNING]", fname, PROGRAM);
3467 add_warning ( stderr, "\nWARNING: failed to extract sequence from %s [%s:WARNING]\n", fname, PROGRAM);
// get_pdb_sequence_from_field: runs the external extract_from_pdb command on
// fname for the given field ("SEQRES" or "ATOM" in the caller), asking for the
// first chain in fasta mode, and reads the result back from a temporary file.
// Returns NULL when no sequence could be read from that file.
// On success the sequence is named after get_pdb_id(fname) when that is
// available, and S->file[0] records fname.
3474 static Sequence* get_pdb_sequence_from_field (char *fname, char *field)
3482 command=vcalloc ( LONG_STRING, sizeof (char));
3483 tp_name=vtmpnam (NULL);
// NOTE(review): the file name is placed between single quotes in a shell
// command; a name containing a single quote would break the quoting.
// The command is written into a LONG_STRING buffer with sprintf.
3484 sprintf ( command, "extract_from_pdb -seq_field %s -chain FIRST -infile \'%s\' -mode fasta > %s", field, check_file_exists(fname), tp_name);
3485 // printf("CO: %s\n", command);
3486 // char *x = vcalloc ( LONG_STRING, sizeof (char));
3487 // sprintf(x, "cp %s ~/Desktop/erg.txt", tp_name);
3489 if ( getenv4debug ("DEBUG_EXTRACT_FROM_PDB"))fprintf ( stderr, "\n[DEBUG_EXTRACT_FROM_PDB:get_pdb_seq] %s\n", command);
3490 my_system ( command);
3493 S=get_fasta_sequence ( tp_name, NULL);
3494 if (S==NULL)return NULL;
3496 if ( (pdbid=get_pdb_id (fname))){sprintf ( S->name[0], "%s",pdbid);vfree (pdbid);}
3499 sprintf ( S->file[0], "%s", fname);
// a single sequence: both length bounds equal its length
3500 S->max_len=S->min_len=S->len[0];
3503 free_sequence (S, -1);
// get_pdb_file: loads the whole content of fname, character by character, into
// a zeroed buffer of count_n_char_in_file(fname)+1 bytes.
// NOTE(review): presumably the buffer is what gets returned -- confirm.
3513 char * get_pdb_file ( char *fname)
3521 file=vcalloc ( sizeof (char),count_n_char_in_file ( fname)+1);
3522 fp=vfopen ( fname, "r");
3523 while ( (c=fgetc(fp))!=EOF)file[a++]=c;
// get_struc_gor: reads the file fname in two passes.
// Pass 1 reads a name and a length per entry and tracks the smallest and
// largest length so that the Sequence structure can be sized.
// Pass 2 reads each entry's name and length, skips the rest of a line, then
// reads one line per residue and keeps the second character of that line.
// NOTE(review): presumably this is GOR secondary-structure output and the kept
// character is the predicted state -- confirm.
3528 Sequence* get_struc_gor ( char *fname)
3530 int nseq, min_len, max_len;
3540 fp=vfopen ( fname, "r");
3542 while ( (c=fgetc(fp))!=EOF)
3548 fscanf ( fp, "%s %d", name, &len);
// min_len==-1 is handled as "no length seen yet"
3549 if (min_len==-1)min_len=max_len=len;
3552 min_len=(len>min_len)?min_len:len;
3553 max_len=(len>max_len)?len:max_len;
3560 S=declare_sequence ( min_len, max_len+1,nseq);
3563 fp=vfopen (fname,"r");
3564 while ( (c=fgetc(fp))!=EOF)
3569 fscanf ( fp, "%s %d\n",S->name[S->nseq], &(S->len[S->nseq]));
// NOTE(review): this skip-to-end-of-line loop does not test for EOF
3571 while ( (c=fgetc(fp))!='\n');
3573 for ( a=0; a<S->len[S->nseq]; a++)
// per residue line: skip one character, keep the next, skip three floats
3574 fscanf ( fp, " %*c %c %*f %*f %*f\n",&(S->seq[S->nseq][a]));
3576 S->seq[S->nseq][a]='\0';
// skip ahead to the next '!' (or the end of the file)
3577 while ( (c=fgetc(fp))!='!' && c!=EOF);
// get_sequence_dali: reads sequences from the file fname in two passes.
// Pass 1 reads each name and measures the sequence that follows, which runs
// up to the first digit (or the end of file), to get the length bounds.
// Pass 2 stores the names and residues in a Sequence structure declared with
// those bounds.
// Exits the program when the file cannot be opened.
3587 Sequence* get_sequence_dali (char *fname)
3598 int min_len_seq=999999;
3601 if ((fp=vfopen (fname,"r"))==NULL)
3602 {printf ( "\nCOULDN'T OPEN %s",fname);
3603 myexit(EXIT_FAILURE);
3611 fscanf (fp, "%s",name);
// a digit ends the sequence; letters, digits, '.' and '-' are the counted characters
3612 while (!isdigit(c=fgetc(fp)) && c!=EOF)
3613 if (isalnum (c) || c=='.' || c=='-')
3615 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3616 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3625 LS=declare_sequence ( min_len_seq, max_len_seq+1,nseq);
3628 fp=vfopen (fname,"r");
3637 fscanf_seq_name (fp, LS->name[current]);
3639 while (!isdigit(c=fgetc(fp)) && c!=EOF)
3642 LS->seq[current][p++]=tolower (c);
3644 LS->seq[current][p++]='-';
3646 LS->seq[current][p++]='-';
3648 LS->seq[current][p]='\0';
3649 LS->len[current]=strlen ( LS->seq[current]);
// get_dialign_sequence: reads sequences from the file fname in two passes.
// Pass 1 reads each name, drops the rest of the name line and measures the
// residues that follow, up to the next '>', blank, tab or end of file.
// Pass 2 stores names and residues in a Sequence structure sized from pass 1.
// Exits the program when the file cannot be opened.
3662 Sequence* get_dialign_sequence (char *fname)
3673 int min_len_seq=999999;
3677 buf=vcalloc ( 1000, sizeof (char));
3678 if ((fp=vfopen (fname,"r"))==NULL)
3679 {printf ( "\nCOULDN'T OPEN %s",fname);
3680 myexit(EXIT_FAILURE);
3686 {fscanf (fp, "%s",name);
3688 buf=fgets ( buf, 1000, fp);
3689 while ((c=fgetc(fp))!='>' && c!=EOF && c!=' ' && c!='\t')
3690 if (isalnum (c)|| is_gap(c))
3692 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3693 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3702 LS=declare_sequence ( min_len_seq, max_len_seq, nseq);
3705 fp=vfopen (fname,"r");
3714 fscanf_seq_name (fp, LS->name[current]);
3715 l=strlen ( LS->name[current]);
// a trailing ',' is removed from the name
// NOTE(review): both sides of the || test ','; get_fasta_sequence_num tests
// ',' and ';' at the same place, so ';' was presumably intended here
3716 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==',')LS->name[current][l-1]='\0';
3717 buf=fgets ( buf, 1000, fp);
// NOTE(review): c!=EOF appears twice in this condition
3719 while ((c=fgetc(fp))!='>' && c!=EOF && c!=EOF && c!=' ' && c!='\t')
3721 LS->seq[current][p++]=tolower (c);
3722 else if ( isgraph(c))
3723 LS->seq[current][p++]=(c);
3724 LS->seq[current][p]='\0';
3725 LS->len[current]=strlen ( LS->seq[current]);
// get_pima_sequence: reads from the file fname the sequences whose name starts
// with a prefix derived from fname itself.
// Pass 1 measures the matching sequences, pass 2 stores them in a Sequence
// structure sized from pass 1. Stored names are cut at the first '.'.
// Exits the program when the file cannot be opened.
3736 Sequence* get_pima_sequence (char *fname)
3748 int min_len_seq=999999;
3749 int nseq=0, l=0, len=0;
3753 sprintf ( prefix, "%s",fname);
// NOTE(review): presumably prefix is cut at the first '-' found here -- confirm
3755 buf=strstr(prefix, "-");
3757 len=strlen (prefix);
3761 buf=vcalloc ( 1000, sizeof (char));
3762 if ((fp=vfopen (fname,"r"))==NULL)
3763 {printf ( "\nCOULDN'T OPEN %s",fname);
3764 myexit(EXIT_FAILURE);
3771 fscanf_seq_name (fp,name);
// only names that begin with the prefix are taken into account
3772 if ( strlen(name)>=len && strncmp ( name, prefix, len)==0)
3779 buf=fgets ( buf, 1000, fp);
3780 while ((c=fgetc(fp))!='>' && c!=EOF)
3781 if (isalnum (c)|| is_gap(c))
3783 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3784 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3794 LS=declare_sequence ( min_len_seq, max_len_seq, nseq);
3797 fp=vfopen (fname,"r");
3805 fscanf_seq_name (fp,LS->name[current]);
3806 if ( strlen(LS->name[current])>=len && strncmp ( LS->name[current], prefix, len)==0)
// the stored name is truncated at its first '.'
3810 buf2=strstr (LS->name[current], ".");
3811 if ( buf2!=NULL) buf2[0]='\0';
3813 l=strlen ( LS->name[current]);
// NOTE(review): both sides of the || test ','; get_fasta_sequence_num tests
// ',' and ';' at the same place
3814 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==',')LS->name[current][l-1]='\0';
3815 buf=fgets ( buf, 1000, fp);
3817 while ((c=fgetc(fp))!='>' && c!=EOF)
3819 LS->seq[current][p++]=tolower (c);
3820 else if ( isgraph(c))
3821 LS->seq[current][p++]=(c);
3822 LS->seq[current][p]='\0';
3823 LS->len[current]=strlen ( LS->seq[current]);
// perl_reformat2fasta: converts fname by running the external program
// perl_command on it with its output redirected to a temporary file, then
// reads that file with get_fasta_sequence.
// check_program_is_installed is called first with IS_FATAL.
// NOTE(review): fname is inserted unquoted into the shell command, and the
// command is built with sprintf -- confirm the size of the command buffer.
3835 Sequence* perl_reformat2fasta (char *perl_command, char *fname)
3840 file=vtmpnam (NULL);
3842 check_program_is_installed ( perl_command,"", perl_command,EMAIL,IS_FATAL);
3843 sprintf ( command, "%s %s > %s", perl_command, fname, file);
3844 my_system ( command);
3845 return get_fasta_sequence (file, NULL);
// get_fasta_sequence_num: reads the fasta-like file fname in two passes.
// The number of sequences is the number of '>' characters in the file.
// Pass 1 measures every sequence to get the length bounds; pass 2 stores the
// name, the rest of the header line (as comment) and the sequence characters.
// NOTE(review): comment_out is not used in the lines documented here.
3847 Sequence* get_fasta_sequence_num (char *fname, char *comment_out)
3869 buffer=vcalloc (1000, sizeof (char));
3870 name=vcalloc ( 100, sizeof (char));
3872 nseq=count_n_char_x_in_file(fname, '>');
// the file size is the starting value for the minimum length
3873 min_len_seq=max=count_n_char_in_file(fname);
3874 sub=vcalloc (max+1, sizeof (int));
3876 fp=vfopen (fname,"r");
3884 fscanf_seq_name (fp,name);
// skip the rest of the header line
3885 while ((c=fgetc(fp))!='\n' && c!=EOF);
3886 while ((c=fgetc(fp))!='>' && c!=EOF)
3887 if (isalnum (c)|| is_gap(c))
3889 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
3890 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
3899 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
3903 fp=vfopen (fname,"r");
3911 fscanf_seq_name (fp,LS->name[current]);
3912 l=strlen ( LS->name[current]);
// a trailing ',' or ';' is removed from the name
3913 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
3914 LS->name[current]=translate_name ( LS->name[current]);
// the rest of the header line is kept as comment, up to COMMENT_SIZE-1 characters
3916 while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1))LS->seq_comment[current][a++]=c;
3917 LS->seq_comment[current][a]='\0';
3921 while ((c=fgetc(fp))!='>' && c!=EOF)
3924 LS->seq[current][p++]=c;
3926 LS->seq[current][p++]=c;
3928 LS->seq[current][p]='\0';
3929 LS->len[current]=strlen ( LS->seq[current]);
/*
 * get_tree_file_list: turn a list of file names into a Sequence of trees.
 * `fname` is split with file2list(fname, "\n"); for every entry whose field
 * [1] does not start with a white space, a record ">name\ncontent\n" is
 * written to `tmp`, where content is file2string(name) or "" when that
 * returns NULL.  The temporary file is then parsed with get_fasta_tree().
 */
3947 Sequence *get_tree_file_list ( char *fname)
3956 list=file2list (fname, "\n");
3957 fp=vfopen (tmp, "w");
3959 while (list[a] && !isspace(list[a][1][0]))
3962 s=file2string (list[a][1]);
3963 fprintf ( fp, ">%s\n%s\n", list[a][1], (s)?s:"");
3967 free_arrayN((void ***)list, 3);
3968 return get_fasta_tree (tmp, NULL);
/*
 * get_file_list: turn a list of names into a Sequence with empty bodies.
 * `fname` is split with file2list(fname, "\n"); for every entry whose field
 * [1] does not start with a white space, a header line ">name" is written
 * to `tmp`, which is then parsed with get_fasta_sequence().
 */
3970 Sequence *get_file_list ( char *fname)
3979 list=file2list (fname, "\n");
3980 fp=vfopen (tmp, "w");
3982 while (list[a] && !isspace(list[a][1][0]))
3985 fprintf ( fp, ">%s\n", list[a][1]);
3989 free_arrayN((void ***)list, 3);
3990 return get_fasta_sequence (tmp, NULL);
/*
 * get_fasta_tree: two-pass reader for a '>'-delimited file.
 * Pass 1 sizes the entries (nseq is the number of '>' characters in the
 * file); pass 2 stores, for each entry, the name (one trailing ',' or ';'
 * removed, then translate_name), the rest of the header line as
 * seq_comment (at most COMMENT_SIZE-1 characters) and the body.
 * NOTE(review): the name is indexed with l-1 without checking l>0.
 */
3992 Sequence*get_fasta_tree (char *fname, char *comment_out)
4014 buffer=vcalloc (1000, sizeof (char));
4015 name=vcalloc ( 100, sizeof (char));
4017 nseq=count_n_char_x_in_file(fname, '>');
4018 min_len_seq=max=count_n_char_in_file(fname);
4019 sub=vcalloc (max+1, sizeof (int));
4021 fp=vfopen (fname,"r");
4029 fscanf_seq_name (fp,name);
/* Skip the remainder of the header line */
4030 while ((c=fgetc(fp))!='\n' && c!=EOF);
4031 while ((c=fgetc(fp))!='>' && c!=EOF)
4034 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4035 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4044 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4048 fp=vfopen (fname,"r");
4056 fscanf_seq_name (fp,LS->name[current]);
4057 l=strlen ( LS->name[current]);
4058 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
4059 LS->name[current]=translate_name ( LS->name[current]);
4061 while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1))LS->seq_comment[current][a++]=c;
4062 LS->seq_comment[current][a]='\0';
4066 while ((c=fgetc(fp))!='>' && c!=EOF)
4068 LS->seq[current][p++]=c;
4071 LS->seq[current][p]='\0';
4072 LS->len[current]=strlen ( LS->seq[current]);
/*
 * get_fasta_sequence_raw: two-pass reader for a '>'-delimited file that
 * keeps the body untouched apart from newlines.
 * Pass 1 sizes the entries (nseq is the number of '>' characters in the
 * file); pass 2 stores the name (one trailing ',' or ';' removed, then
 * translate_name), the rest of the header line as seq_comment (at most
 * COMMENT_SIZE-1 characters) and every body character except '\n'.
 * NOTE(review): the name is indexed with l-1 without checking l>0.
 */
4092 Sequence* get_fasta_sequence_raw (char *fname, char *comment_out)
4114 buffer=vcalloc (1000, sizeof (char));
4115 name=vcalloc ( 100, sizeof (char));
4117 nseq=count_n_char_x_in_file(fname, '>');
4118 min_len_seq=max=count_n_char_in_file(fname);
4119 sub=vcalloc (max+1, sizeof (int));
4121 fp=vfopen (fname,"r");
4129 fscanf_seq_name (fp,name);
/* Skip the remainder of the header line */
4130 while ((c=fgetc(fp))!='\n' && c!=EOF);
4131 while ((c=fgetc(fp))!='>' && c!=EOF)
4134 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4135 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4144 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4148 fp=vfopen (fname,"r");
4156 fscanf_seq_name (fp,LS->name[current]);
4157 l=strlen ( LS->name[current]);
4158 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
4159 LS->name[current]=translate_name ( LS->name[current]);
4161 while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1))LS->seq_comment[current][a++]=c;
4162 LS->seq_comment[current][a]='\0';
4166 while ((c=fgetc(fp))!='>' && c!=EOF)
4169 if (c!='\n')LS->seq[current][p++]=c;
4172 LS->seq[current][p]='\0';
4173 LS->len[current]=strlen ( LS->seq[current]);
/*
 * get_fasta_sequence: main two-pass FASTA reader.
 *
 * Disk mode: `disk` is forced to 1 when it is already 1, when the
 * "use_disk" int variable is set, or when the environment variable
 * SEQ_ON_DISK_4_TCOFFEE exists.  In that mode sequences are not stored
 * (the stores below are guarded by disk==0); LS->dc[current][0] and
 * LS->dc[current][1] record values of the running counter `coor`, which
 * is advanced by the name and comment readers.
 * NOTE(review): dc presumably holds the start/end file offsets of each
 * sequence; verify against the users of LS->dc.
 *
 * PDB entries: when the collected body equals "PDB" (strm), the sequence is
 * obtained through get_pdb_struc() + get_pdb_sequence() and used instead;
 * a warning is issued when the PDB file could not be fetched.
 *
 * Names lose one trailing ',' or ';' and go through translate_name; the
 * rest of the header line is kept as seq_comment (at most COMMENT_SIZE-1
 * characters).
 * NOTE(review): the name is indexed with l-1 without checking l>0.
 */
4192 Sequence* get_fasta_sequence (char *fname, char *comment_out)
4218 buffer=vcalloc (1000, sizeof (char));
4219 name=vcalloc ( 10000, sizeof (char));
4221 nseq=count_n_char_x_in_file(fname, '>');
4222 if (disk==1 || get_int_variable ("use_disk") || getenv ("SEQ_ON_DISK_4_TCOFFEE")){disk=1;}
4225 vfree (buffer); vfree (name);
4229 min_len_seq=max=count_n_char_in_file(fname);
4230 sub=vcalloc (max+1, sizeof (char));
4232 fp=vfopen (fname,"r");
4241 fscanf_seq_name (fp,name);
4242 while ((c=fgetc(fp))!='\n' && c!=EOF);
4243 while ((c=fgetc(fp))!='>' && c!=EOF)
4245 if (isalnum (c)|| is_gap(c))
/* A body reading "PDB" is sized from the structure file, not from the text */
4249 if (strm (sub, "PDB"))
4251 pdb_name=get_pdb_struc(name,0, 0);
4252 pdb_S=get_pdb_sequence (pdb_name);
4255 clen=strlen( pdb_S->seq[0]);
4256 free_sequence ( pdb_S,1);
4263 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4264 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4276 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4279 LS=declare_sequence (0,0,nseq);
4280 for (a=0; a<nseq; a++)LS->seq[a]=NULL;
4284 fp=vfopen (fname,"r");
4292 coor+=fscanf_seq_name (fp, LS->name[current]);
4295 l=strlen ( LS->name[current]);
4296 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
4297 LS->name[current]=translate_name ( LS->name[current]);
4299 while ((c=fgetc(fp))!='\n' && c!=EOF && a<(COMMENT_SIZE-1)){LS->seq_comment[current][a++]=c;coor++;}
4302 LS->seq_comment[current][a]='\0';
4305 while ((c=fgetc(fp))!='>' && c!=EOF)
4311 if (p==0)LS->dc[current][0]=coor;
4313 if (disk==0)LS->seq[current][p++]=c;
4317 LS->dc[current][1]=coor;
4321 if ( disk==0)LS->seq[current][p]='\0';
4323 if (LS->seq[current] && strm (LS->seq[current], "PDB"))
4326 pdb_name=get_pdb_struc(LS->name[current],0, 0);
4327 pdb_S=get_pdb_sequence (pdb_name);
4330 sprintf ( LS->seq[current], "%s", pdb_S->seq[0]);
4331 clen=strlen( pdb_S->seq[0]);
4332 free_sequence ( pdb_S, 1);
4336 add_warning (stderr, "WARNING: Could not fetch PDB file: %s", pdb_name);
4356 //LS=clean_sequence (LS);
/*
 * get_sub_fasta_sequence: two-pass reader for a '>'-delimited file.
 * Pass 1: after the name, the rest of the header line is skipped and one
 * further line is read into buf with fgets before the entry length is
 * measured (alphanumeric or gap characters only).
 * Pass 2: stores the name (one trailing ',' or ';' removed, then
 * translate_name), skips the rest of the header line and stores the body;
 * one of the two stores lower-cases the character.
 * NOTE(review): buf=fgets(buf,...) replaces buf with NULL at end of file.
 * NOTE(review): the name is indexed with l-1 without checking l>0.
 */
4361 Sequence* get_sub_fasta_sequence (char *fname, char *comment_out)
4382 nseq=count_n_char_x_in_file(fname, '>');
4383 min_len_seq=max=count_n_char_in_file(fname);
4384 sub=vcalloc (max+1, sizeof (int));
4385 buf=vcalloc ( max+1, sizeof (char));
4386 fp=vfopen (fname,"r");
4394 fscanf_seq_name (fp,name);
4395 while ((c=fgetc(fp))!='\n' && c!=EOF);
4396 buf=fgets ( buf,max, fp);
4397 while ((c=fgetc(fp))!='>' && c!=EOF)
4398 if (isalnum (c)|| is_gap(c))
4400 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4401 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4410 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4413 fp=vfopen (fname,"r");
4421 fscanf_seq_name (fp,LS->name[current]);
4422 l=strlen ( LS->name[current]);
4423 if ( LS->name[current][l-1]==','||LS->name[current][l-1]==';')LS->name[current][l-1]='\0';
4424 LS->name[current]=translate_name ( LS->name[current]);
4425 while ((c=fgetc(fp))!='\n' && c!=EOF);
4428 while ((c=fgetc(fp))!='>' && c!=EOF)
4431 LS->seq[current][p++]=tolower (c);
4433 LS->seq[current][p++]=(c);
4436 LS->seq[current][p]='\0';
4437 LS->len[current]=strlen ( LS->seq[current]);
/*
 * get_pir_sequence: two-pass reader for a PIR-style file.
 *
 * Each entry starts at '>'.  If the next character is 'P', everything up to
 * and including the following ';' is skipped before the name is read;
 * otherwise the character is pushed back.  The line after the name line is
 * kept as seq_comment; the body runs to the next '>' and drops white space
 * and '*'.
 * Pass 1 measures entry lengths (alphanumeric or gap characters) for
 * declare_sequence(); pass 2 stores names, comments and residues.
 * When comment_out is not NULL the comments are written with
 * output_pir_check().
 * Exits through myexit(EXIT_FAILURE) when the file cannot be opened.
 *
 * Robustness: the ';' skips stop at EOF, the name is only trimmed when it
 * is non-empty, and a missing comment line yields an empty comment.
 */
4454 Sequence* get_pir_sequence (char *fname, char *comment_out)
4466 int min_len_seq=999999;
4470 buf=vcalloc ( 1000, sizeof (char));
4471 if ((fp=vfopen (fname,"r"))==NULL)
4472 {printf ( "\nCOULDN'T OPEN %s",fname);
4473 myexit(EXIT_FAILURE);
4480 if ( (c=fgetc(fp))=='P')while ( (c=fgetc(fp))!=';' && c!=EOF);
4481 else ungetc ( c, fp);
4482 fscanf_seq_name (fp,name);
4484 buf=fgets ( buf, 1000, fp);
4485 while ((c=fgetc(fp))!='>' && c!=EOF)
4486 if (isalnum (c)|| is_gap(c))
4488 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4489 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4500 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4503 fp=vfopen (fname,"r");
4511 if ( (c=fgetc(fp))=='P')while ( (c=fgetc(fp))!=';' && c!=EOF);
4512 else ungetc ( c, fp);
4514 fscanf_seq_name (fp,LS->name[current]);
4516 l=strlen ( LS->name[current]);
/* Strip one trailing ',' or ';' as the FASTA readers of this file do */
4517 if ( l>0 && (LS->name[current][l-1]==','||LS->name[current][l-1]==';'))LS->name[current][l-1]='\0';
4518 LS->name[current]=translate_name ( LS->name[current]);
4519 buf=fgets ( buf, 1000, fp);
/* Keep the seq_comment pointer even when fgets fails, and cut at the
   newline instead of unconditionally dropping the last character */
4521 if ( !fgets ( LS->seq_comment[current],COMMENT_SIZE-1, fp))LS->seq_comment[current][0]='\0';
4522 LS->seq_comment[current][strcspn (LS->seq_comment[current], "\n")]='\0';
4524 while ((c=fgetc(fp))!='>' && c!=EOF)
4526 LS->seq[current][p++]=tolower (c);
4527 else if ( !isspace(c) && c!='*')
4528 LS->seq[current][p++]=(c);
4529 LS->seq[current][p]='\0';
4530 LS->len[current]=strlen ( LS->seq[current]);
4538 if (comment_out!=NULL) output_pir_check ( comment_out,LS->nseq, LS->seq_comment);
/*
 * get_gor_sequence: two-pass reader for a file whose entry bodies are
 * terminated by '!'.
 * After the name, one line is consumed with fgets; the body then runs up to
 * the next '!' (or EOF).  Only alphanumeric or gap characters are counted
 * (pass 1) and stored, lower-cased (pass 2).  Names go through
 * translate_name.
 * Exits through myexit(EXIT_FAILURE) when the file cannot be opened.
 * NOTE(review): buf=fgets(buf,...) replaces buf with NULL at end of file.
 */
4542 Sequence* get_gor_sequence (char *fname, char *comment_out)
4554 int min_len_seq=99999;
4558 buf=vcalloc ( 1000, sizeof (char));
4559 if ((fp=vfopen (fname,"r"))==NULL)
4560 {printf ( "\nCOULDN'T OPEN %s",fname);
4561 myexit(EXIT_FAILURE);
4568 fscanf_seq_name (fp,name);
4570 buf=fgets ( buf, 1000, fp);
4571 while ((c=fgetc(fp))!='!' && c!=EOF)
4572 if (isalnum (c)|| is_gap(c))
4574 max_len_seq=(clen> max_len_seq)?clen: max_len_seq;
4575 min_len_seq=(clen< min_len_seq)?clen: min_len_seq;
4584 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4587 fp=vfopen (fname,"r");
4597 fscanf_seq_name (fp,LS->name[current]);
4598 LS->name[current]=translate_name ( LS->name[current]);
4599 buf=fgets ( buf, 1000, fp);
4602 while ((c=fgetc(fp))!='!' && c!=EOF)
4603 if (isalnum (c)|| is_gap(c))
4604 LS->seq[current][p++]=tolower (c);
4606 LS->seq[current][p]='\0';
4607 LS->len[current]=strlen ( LS->seq[current]);
/*
 * get_swissprot_sequence: read the entries of a SwissProt-style flat file.
 *
 * Pass 1 visits every "\nSQ" token, skips the rest of that line and counts
 * the alphabetic characters up to the next '/' to obtain the minimum and
 * maximum lengths for declare_sequence().
 * Pass 2 visits every "\nID" token, reads the name that follows it, moves
 * to the matching "\nSQ" token and stores the alphabetic characters found
 * before the next '/'.
 * Exits through myexit(EXIT_FAILURE) when the file does not exist.
 *
 * The '/' scans also stop at EOF so that an entry without a terminator
 * cannot loop forever.
 * NOTE(review): the result of the "\nSQ" lookup in pass 2 is used without
 * a NULL check.
 */
4618 Sequence* get_swissprot_sequence (char *fname, char *comment_out)
4625 int len, max_len_seq=0, min_len_seq=0;
4627 if ( !check_file_exists(fname))
4628 {printf ( "\nCOULDN'T OPEN %s",fname);
4629 myexit(EXIT_FAILURE);
4632 buf=vcalloc (LONG_STRING+1, sizeof (char));
4634 while ( (fp=find_token_in_file(fname,fp,"\nSQ")))
4637 fgets (buf, LONG_STRING, fp);
4639 while ((c=fgetc(fp))!='/' && c!=EOF)if(isalpha(c))len++;
4640 if ( max_len_seq==0)max_len_seq=min_len_seq=len;
4643 max_len_seq=MAX(len, max_len_seq);
4644 min_len_seq=MIN(len, min_len_seq);
4648 LS=declare_sequence ( min_len_seq, max_len_seq,nseq);
4652 while ( (fp=find_token_in_file(fname,fp,"\nID")))
4654 fscanf_seq_name (fp, LS->name[LS->nseq]);
4655 fp=find_token_in_file(fname,fp,"\nSQ");
4656 fgets (buf, LONG_STRING, fp);
4657 while ((c=fgetc(fp))!='/' && c!=EOF)if (isalpha(c))LS->seq[LS->nseq][LS->len[LS->nseq]++]=c;
4658 LS->seq[LS->nseq][LS->len[LS->nseq]]='\0';
/*
 * fscanf_seq_name: read one white-space delimited token from fp and copy
 * it into sname, limited to MAXNAMES characters.
 *
 * fp    : stream positioned before the name.
 * sname : destination buffer; when NULL nothing is read and 0 is returned.
 *
 * The token is first read into a 10000-byte scratch buffer that is
 * allocated on first use.  The read is limited to 9999 characters so that
 * it cannot overrun that buffer, and a failed read (for instance at EOF)
 * yields an empty name instead of whatever the buffer held before.
 * A warning is issued when the token is longer than MAXNAMES.
 */
4665 int fscanf_seq_name ( FILE *fp, char *sname)
4669 if ( !sname) return 0;
4671 if ( !name)name=vcalloc ( 10000, sizeof (char));
4672 if ( fscanf (fp, "%9999s", name)!=1)name[0]='\0';
4674 if ( strlen (name)>MAXNAMES)
4675 add_warning (stderr, "\nWARNING: Seq Name Too long: [%s]. Truncated to %d", name, MAXNAMES);
4676 name[MAXNAMES]='\0';
4677 sprintf ( sname, "%s", name);
4681 /*******************************************************************************************/
4686 /***************************************************************************************** */
/*
 * undump_msa: reload aligned sequences written by dump_msa().
 * Does nothing when A or tmp is NULL or when the file does not exist.
 * The alignment is reallocated to hold lines as long as the longest line
 * of the file, then each "<index> <sequence>" record is copied into
 * A->seq_al[index].
 * NOTE(review): index comes straight from the file and is not range-checked.
 */
4687 void undump_msa ( Alignment *A, char *tmp)
4694 if ( !A || !tmp || !check_file_exists (tmp))return;
4695 m=measure_longest_line_in_file (tmp );
4696 A=realloc_aln2 ( A,A->max_n_seq,m+1);
4698 buf=vcalloc (m+1, sizeof (char));
4699 fp=vfopen (tmp, "r");
4700 while (fscanf (fp, "%d %s\n", &index, buf)==2)
4702 sprintf ( A->seq_al[index], "%s", buf);
/*
 * dump_msa: write nseq aligned sequences of A to `file`, one
 * "<index> <sequence>" record per line, where the indices are taken from
 * lseq[0..nseq-1].
 */
4707 void dump_msa ( char *file,Alignment *A, int nseq, int *lseq)
4711 fp=vfopen (file, "w");
4712 for (a=0; a<nseq; a++)
4713 fprintf ( fp, "%d %s\n", lseq[a], A->seq_al[lseq[a]]);
/*
 * read_aln: read an alignment by converting it to FASTA first.
 * Runs "clustalw_aln2fasta_aln.pl <file_name> > <tmp>"; a non-zero exit
 * status is fatal (printf_exit).  The converted file is parsed with
 * get_fasta_sequence() and loaded into A with seq2aln().
 */
4717 void read_aln (char *file_name, Alignment *A)
4723 tmp_name=vtmpnam (NULL);
4725 if (printf_system ( "clustalw_aln2fasta_aln.pl %s > %s",file_name, tmp_name)!=EXIT_SUCCESS)
4727 printf_exit ( EXIT_FAILURE, stderr, "Could Not Read File %s [FATAL:%s]\n", file_name, PROGRAM);
4731 S=get_fasta_sequence ( tmp_name,NULL);
4732 A=seq2aln (S, A, 0);
/*
 * read_stockholm_aln: same conversion route as read_aln() (the
 * clustalw_aln2fasta_aln.pl script followed by get_fasta_sequence()), with
 * one extra step: in every name containing "_stockholm", the markers
 * "_stockholmspace_" and "_stockholmhasch_" are replaced by " " and "#"
 * before the sequences are loaded into A with seq2aln().
 */
4736 void read_stockholm_aln (char *file_name, Alignment *A)
4742 tmp_name=vtmpnam (NULL);
4743 if (printf_system ( "clustalw_aln2fasta_aln.pl %s > %s",file_name, tmp_name)!=EXIT_SUCCESS)
4745 printf_exit ( EXIT_FAILURE, stderr, "Could Not Read File %s [FATAL:%s]\n", file_name, PROGRAM);
4750 S=get_fasta_sequence ( tmp_name,NULL);
4751 for (a=0; a<S->nseq; a++)
4753 if (strstr (S->name[a], "_stockholm"))
4755 substitute ( S->name[a], "_stockholmspace_", " ");
4756 substitute ( S->name[a], "_stockholmhasch_", "#");
4759 A=seq2aln (S, A, 0);
/*
 * read_blast_aln: read a BLAST report as an alignment.
 * is_blast_file() gives the report type; a zero result leads to
 * myexit(EXIT_FAILURE).  BLAST_TXT reports are piped through
 * blast_aln2fasta_aln.pl and fasta_aln2fasta_aln_unique_name.pl, BLAST_XML
 * reports through blast_xml2fasta_aln.pl.  The converted file is then read
 * with main_read_aln().
 * NOTE(review): the exit status of the conversion commands is not checked.
 */
4763 Alignment* read_blast_aln ( char *file_name, Alignment *A)
4770 if ( !(type=is_blast_file (file_name)))
4772 myexit (EXIT_FAILURE);
4774 tmp_name=vtmpnam ( NULL);
4775 if (type==BLAST_TXT)
4777 printf_system("cat %s | blast_aln2fasta_aln.pl | fasta_aln2fasta_aln_unique_name.pl >%s", file_name, tmp_name);
4779 else if (type==BLAST_XML)
4782 printf_system("blast_xml2fasta_aln.pl %s >%s", file_name, tmp_name);
4785 main_read_aln (tmp_name, A);
/*
 * read_number_aln: read a block-formatted alignment into A.
 *
 * The input is first copied character by character into a temporary file,
 * which is then scanned three times:
 *   1. count the sequences of the first block;
 *   2. after reallocating A (realloc_alignment2), store the leading
 *      non-blank lines as A->aln_comment and read the sequence names;
 *      a duplicated name is fatal unless the environment variable
 *      ALLOW_DUPLICATE is set;
 *   3. read the residues: each line name is looked up in A->name and the
 *      graphic or gap characters that follow are appended to the matching
 *      A->seq_al row (lower-cased unless A->residue_case==2).
 * Finally every row is NUL-terminated and A->len_aln is set from row 0.
 *
 * NOTE(review): max_len is count_n_char_in_file()/nseq, so nseq==0 divides
 * by zero.
 * NOTE(review): several fgetc loops below test only for '\n' or for a
 * character class and have no EOF test.
 */
4790 void read_number_aln ( char *file_name, Alignment *A)
4807 fp=vfopen ( file_name, "r");
4809 fname=vtmpnam(NULL);
4810 fp2=vfopen ( fname, "w");
4811 while ( (c=fgetc(fp))!=EOF)
4813 fprintf ( fp2, "%c", c);
4819 /*1 Count The number of sequences*/
4820 fp=vfopen ( fname, "r");
4821 buf=vfgets ( buf,fp);
/* NOTE(review): the test on the next line ends in ';' and has no effect */
4822 if ( !isblanc (buf));
4823 while ( isblanc (buf))
4825 buf=vfgets ( buf, fp);
4827 while (!isblanc (buf))
4829 buf=vfgets ( buf,fp);
4831 while ( !isalnum ((c=fgetc(fp))))
4834 buf=vfgets ( buf,fp);
4837 if ( c!='\n')ungetc(c,fp);
4839 while ( isalnum ((c=fgetc(fp))))
4843 while ( isgraph ((c=fgetc(fp))));
4845 buf=vfgets ( buf, fp);
4851 max_len=count_n_char_in_file(fname)/nseq;
4852 A=realloc_alignment2( A, nseq+1, max_len+1);
4857 fp=vfopen ( fname, "r");
4858 buf=vfgets ( buf, fp);
4859 if ( !isblanc (buf))sprintf (A->aln_comment[n_comment++], "%s", buf);
4860 while ( isblanc (buf))
4862 buf=vfgets ( buf,fp);
4864 while (!isblanc (buf))
4866 buf=vfgets ( buf, fp);
4867 sprintf ( A->aln_comment[n_comment++], "%s", buf);
4870 while ( !isalnum ((c=fgetc(fp))))
4873 buf=vfgets ( buf, fp);
4877 if ( c!='\n')ungetc(c,fp);
4879 while ( isalnum ((c=fgetc(fp))))
4883 fscanf_seq_name (fp, A->name[A->nseq]);
/* Duplicate names: warn, and abort unless ALLOW_DUPLICATE is set */
4885 if ( name_is_in_list (A->name[A->nseq], A->name, A->nseq, 100)!=-1)
4887 fprintf ( stderr, "\nWARNING (read_number_aln): Sequence %s Duplicated in File %s ", A->name[A->nseq], A->file[A->nseq]);
4888 if (!getenv("ALLOW_DUPLICATE"))
4890 fprintf ( stderr, " [FATAL:%s]\n", PROGRAM);
4891 myexit (EXIT_FAILURE);
4895 buf=vfgets ( buf,fp);
/* NOTE(review): a failed open is only reported; execution continues */
4902 if ((fp=vfopen ( fname, "r"))==NULL)
4903 printf ( "\nCOULDN'T READ %s", fname);
/* ptr_aln[i] is the write position in A->seq_al[i] */
4905 ptr_aln=vcalloc ( A->nseq, sizeof(int));
4908 while ( (c=fgetc(fp))!='\n');
4909 if ( (c=fgetc(fp))=='\n')
4912 while ( !isalnum(c=fgetc(fp)));
4917 while(tot< A->nseq && c!=EOF)
4920 while ( !isgraph (c=fgetc(fp)) && c!=EOF);
4921 if ( c!=EOF)ungetc(c, fp);
4922 while ( isgraph((buf[b++]=fgetc(fp))));
/* a stays at -1 when the line name matches no known sequence */
4924 for ( a=-1,d=0; d< A->nseq; d++)
4925 if ( strcmp (A->name[d], buf)==0)
4930 if ( a==-1) while ( (c=fgetc(fp))!='\n' && c!=EOF);
4933 while ( (c=fgetc(fp))!='\n')
4935 if ( isgraph(c) || is_gap(c))
4937 c=(A->residue_case==2)?c:tolower(c);
4939 if (!isspace(c))A->seq_al[a][ptr_aln[a]++]=c;
4944 while ( !isalnum(c=getc(fp)) && c!=EOF);
4952 for ( a=0; a< A->nseq; a++)
4953 {A->seq_al[a][ptr_aln[a]]='\0';
4958 A->len_aln= strlen(A->seq_al[0]);
/*
 * read_amps_aln: read an AMPS-style alignment into A.
 * The sequence names and A->nseq come from get_amps_seq_name(); the file
 * is then positioned with set_fp_id(fp, "1*") and the rest of that line is
 * skipped before the alignment rows are read.  A blank read for a sequence
 * is stored as the gap character '-'.
 * NOTE(review): the line-skipping loops test only for '\n' and have no EOF
 * test.
 */
4965 void read_amps_aln ( char *in_file, Alignment *A)
4968 int a, b, c, cont=1;
4969 A->nseq=get_amps_seq_name ( A->name, in_file);
4971 fp=vfopen ( in_file, "r");
4972 fp=set_fp_id(fp, "1*");
4973 while ( (c=fgetc(fp))!='\n');
4982 for ( a=0; a<A->nseq; a++)
4983 A->seq_al[a][b]='\0';
4990 for ( a=0; a< A->nseq; a++)
4993 if ( c==' ')A->seq_al[a][b]='-';
5000 while ((c=fgetc(fp))!='\n');
/*
 * get_amps_seq_name: collect the sequence names listed after the "Index"
 * marker of an AMPS file.
 * After skipping the marker line, every line that starts with a white
 * space is parsed as "<number> ><name>" and the name is stored in
 * name[nseq++].
 * NOTE(review): the fscanf result is not checked and the line-skipping
 * loops have no EOF test.
 */
5011 int get_amps_seq_name ( char **name, char* fname)
5016 fp=vfopen ( fname, "r");
5017 fp=set_fp_id ( fp, "Index");
5018 while ( (fgetc(fp))!='\n');
5019 while ( isspace(fgetc(fp)))
5020 {fscanf (fp, "%*d >%s", name[nseq++]);
5021 while ( (fgetc(fp))!='\n');
/*
 * read_gotoh_aln: read a Gotoh-style alignment into A.
 * Returns NULL when the file does not exist.
 *
 * The file is scanned three times.  Each scan skips a run of blank lines,
 * a run of non-blank lines and another run of blank lines to reach the
 * first alignment block.
 *   1. count the sequences of the first block; a duplicated name is fatal
 *      unless the environment variable ALLOW_DUPLICATE is set;
 *   2. after reallocating A (realloc_aln2), read the names again: a line
 *      is parsed as "%d %s %s %s" and, when four fields are present, the
 *      fourth one is the name;
 *   3. read the residues: the second field of each line is appended to the
 *      row whose name matches (a trailing '|' is excluded, characters are
 *      lower-cased unless A->residue_case==2).
 * Afterwards rows are NUL-terminated, A->len_aln is set from row 0 and
 * A->len[a] counts the non-gap positions of each row.
 *
 * NOTE(review): buf2, buf3 and buf4 are VERY_LONG_STRING+1 bytes each on
 * the stack and are filled with unbounded %s conversions.
 * NOTE(review): max_len is count_n_char_in_file()/nseq, so nseq==0 divides
 * by zero.
 */
5026 Alignment * read_gotoh_aln ( char *fname, Alignment *A)
5034 char buf2[VERY_LONG_STRING+1];
5035 char buf3[VERY_LONG_STRING+1];
5036 char buf4[VERY_LONG_STRING+1];
5044 if ( !check_file_exists (fname))return NULL;
5045 fp=vfopen ( fname, "r");
5047 /*1 GET THE NUMBER OF SEQUENCES*/
5049 buf=vcalloc ( VERY_LONG_STRING+1, sizeof (char));
5050 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5051 while (!isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5052 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
/* NOTE(review): buf is passed to isblanc before it is compared to NULL */
5053 while ( !isblanc ( buf) && buf!=NULL)
5056 d=sscanf ( buf, "%d %s %s %s", &a, buf2, A->name[A->nseq],buf3);
5059 if ( name_is_in_list (A->name[A->nseq], A->name, A->nseq, 100)!=-1)
5061 fprintf ( stderr, "\nWARNING (get_amps_seq_name): Sequence %s Duplicated in File %s ", A->name[A->nseq], A->file[A->nseq]);
5062 if (!getenv("ALLOW_DUPLICATE"))
5064 fprintf ( stderr, " [FATAL:%s]\n", PROGRAM);
5065 myexit (EXIT_FAILURE);
5069 fgets(buf, VERY_LONG_STRING, fp);
5074 /*2 Get the MAX Len and Reallocate*/
5075 max_len=count_n_char_in_file(fname)/nseq;
5076 A=realloc_aln2( A, nseq+1, max_len+1);
5077 /*3 Get The Sequences Names*/
5079 fp=vfopen ( fname, "r");
5080 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5081 while (!isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5082 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5083 while ( !isblanc ( buf) && buf!=NULL)
5086 d=sscanf ( buf, "%d %s %s %s", &a, buf2, A->name[A->nseq],buf3);
5089 if ( d==4)sprintf (A->name[A->nseq],"%s", buf3);
5091 fgets(buf, VERY_LONG_STRING, fp);
5098 fp=vfopen ( fname, "r");
/* NOTE(review): buf is allocated a second time here */
5100 buf=vcalloc ( VERY_LONG_STRING+1, sizeof (char));;
/* ptr_aln[i] is the write position in A->seq_al[i] */
5101 ptr_aln=vcalloc ( A->nseq, sizeof(int));
5103 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5104 while (!isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
5107 while ( isblanc (buf=fgets ( buf, VERY_LONG_STRING, fp)));
/* e is both the target of the first %d and the receiver of the field count;
   the count is what remains in e after the statement */
5115 e=sscanf (buf, "%d %s %s %s", &e, buf2, buf3, buf4);
5116 if ( e==4)sprintf( buf3, "%s", buf4);
5119 for ( d=0; d< A->nseq; d++)
5122 if ( strcmp (A->name[d], buf3)==0)
5128 if ( buf2[l-1]=='|')l--;
5133 if ( isgraph (buf2[b]))
5134 A->seq_al[a][ptr_aln[a]++]=(A->residue_case==2)?buf2[b]:tolower (buf2[b]);
5136 buf=fgets(buf, VERY_LONG_STRING, fp);
5140 buf=fgets(buf, VERY_LONG_STRING, fp);
5141 while ( isblanc (buf) && buf!=NULL)
5143 buf=fgets ( buf, VERY_LONG_STRING, fp);
5152 for ( a=0; a< A->nseq; a++)
5153 {A->seq_al[a][ptr_aln[a]]='\0';
5156 A->len_aln= strlen(A->seq_al[0]);
5160 for ( a=0; a< A->nseq; a++)
5162 for ( b=0; b< A->len_aln; b++)
5163 A->len[a]+=1-is_gap(A->seq_al[a][b]);
/* Counts the columns where neither of the first two rows has a gap */
5165 for ( a=0, b=0; a< A->len_aln; a++)
5167 if ( !is_gap(A->seq_al[0][a]) &&!is_gap(A->seq_al[1][a]))b++;
/*
 * read_msf_aln: read an MSF alignment by converting it to FASTA first.
 * Runs "msf_aln2fasta_aln.pl <fname> > <tmp>" through my_system(); a
 * non-zero exit status is reported as an illegal MSF file and is fatal.
 * The converted file is parsed with get_fasta_sequence() and loaded into A
 * with seq2aln().
 */
5176 void read_msf_aln ( char *fname, Alignment *A)
5182 tmp_name=vtmpnam(NULL);
5183 sprintf ( command, "msf_aln2fasta_aln.pl %s > %s", fname, tmp_name);
5185 if ( my_system (command)!=EXIT_SUCCESS)
5187 fprintf ( stderr, "\nERROR: file %s does not have a legal msf format [FATAL:%s]", fname,PROGRAM);
5188 myexit (EXIT_FAILURE);
5191 S=get_fasta_sequence ( tmp_name,NULL);
5192 A=seq2aln (S, A, 0);
5197 /**************************************************************************************************/
5198 /*************************************REFORMAT OUT*************************************************/
5199 /**************************************************************************************************/
5200 /*******************************************************************************************/
5203 /* OUTPUT MATRICES */
5205 /***************************************************************************************** */
/*
 * output_freq_mat: write the per-column residue counts of A to `outfile`.
 * The counts come from aln2count_mat(A).  One row is written for each of
 * the 26 letters 'A'..'Z' ("<letter> |" followed by one count per column),
 * then one row labelled "- |" using index 26 of the count matrix.
 */
5209 int output_freq_mat ( char *outfile, Alignment *A)
5211 function documentation: start
5213 int output_freq_mat ( char *outfile, Aligmnent *A)
5215 This function counts the number of residues in each column of an alignment (Prot)
5216 It outputs these values in the following format
5222 This format can be piped into:
5223 The routine used for computing the p-value gmat-inf-gc-v2c
5225 function documentation: end
5233 freq_mat=aln2count_mat (A);
5235 fp=vfopen ( outfile, "w");
5236 for ( b=0; b< 26; b++)
5238 fprintf (fp, "%c |", 'A'+b);
5239 for ( a=0; a< A->len_aln; a++)fprintf (fp,"%d ", freq_mat[b][a]);
5242 fprintf (fp, "- |");
5243 for ( a=0; a< A->len_aln; a++)fprintf (fp,"%d ", freq_mat[26][a]);
5245 free_int (freq_mat, -1);
5249 /*******************************************************************************************/
5252 /* OUTPUT P-Values */
5254 /***************************************************************************************** */
/*
 * output_maln_pval: compute the ln(p-value) of alignment A with the
 * external program gmat-inf-gc-v2c and write it to `outfile`.
 * The frequency matrix is produced by output_freq_mat() and piped into the
 * program; the number following the token "ln(p-value):" in the result
 * file is parsed with atof and written with six decimals.
 * Returns 0 when the result file does not exist.
 */
5255 float output_maln_pval ( char *outfile, Alignment *A)
5258 function documentation: start
5259 float output_maln_pval ( char *outfile, Aligmnent *A)
5261 This function outputs the p-value of a multiple alignmnet as described
5262 in Hertz, Stormo, Bioinformatics, 15-7/8, 563/577
5263 ftp beagle.colorado.edu /pub/cosensus
5265 packages/consensus/gmat-inf-gc-v2c
5268 The routine used for computing the p-value is the program gmat-inf-gc-v2c
5269 function documentation: end
5277 char command[LONG_STRING];
5278 char string[STRING];
5280 result=vtmpnam (NULL);
5282 output_freq_mat (mat,A);
5283 sprintf ( command, "more %s | gmat-inf-gc-v2c -A abcdefghijklmnopqrstuvwxyz> %s",mat, result);
5284 my_system ( command);
5286 if ( !check_file_exists(result))return 0;
5287 fp=find_token_in_file ( result, NULL, "ln(p-value):");
/* NOTE(review): %s is unbounded while string holds STRING bytes */
5289 fscanf ( fp, "%s",string);
5290 value=atof ( string);
5296 fp=vfopen ( outfile, "w");
5297 fprintf ( fp, "%.6f\n", value);
5304 /*******************************************************************************************/
5307 /* OUTPUT WEIGHTS */
5309 /***************************************************************************************** */
/*
 * output_seq_weights: write one "<name> <weight>" line per sequence of W
 * to `wfile`, the weight being printed with two decimals.
 * Returns 0 when W is NULL or when the file cannot be opened.
 */
5310 int output_seq_weights ( Weights *W, char *wfile)
5315 if ( W==NULL)return 0;
5317 fp=vfopen (wfile, "w");
5318 if ( fp==NULL)return 0;
5321 for ( a=0; a< W->nseq; a++)
5324 fprintf ( fp, "%s %.2f\n", W->seq_name[a],W->SEQ_W[a]);
/*
 * output_pw_weights4saga: write pairwise weights to `wfile`.
 * Layout: W->comments, a "$" line, one "<name_a> <name_b> <weight>" line
 * for every pair a<b (weight taken from w_list[a][b]), and a closing "$"
 * line.
 */
5329 void output_pw_weights4saga ( Weights *W, float **w_list, char *wfile)
5333 fp=vfopen (wfile, "w");
5335 fprintf ( fp, "%s\n$\n", W->comments);
5336 for ( a=0; a< W->nseq-1; a++)
5338 for (b=a+1; b< W->nseq; b++)
5340 fprintf ( fp, "%s %s %f\n", W->seq_name[a], W->seq_name[b],w_list[a][b]);
5343 fprintf ( fp, "$\n");
/*
 * display_weights: print the sequence weights to fp.
 * Either prints the "UN-WEIGHTED MODE" banner, or prints the weighting
 * mode followed by one "<name> <weight>" line per sequence, the names
 * being right-aligned on the longest one.
 */
5347 FILE * display_weights (Weights *W, FILE *fp)
5354 fprintf ( fp, "\n\nUN-WEIGHTED MODE: EVERY SEQUENCE WEIGHTS 1\n");
5357 fprintf ( fp, "\n\nWEIGHTED MODE:%s\n\n", (W)->mode);
5358 for ( a=0, max_len=0; a< W->nseq; a++)max_len=MAX(max_len, strlen (W->seq_name[a]));
5359 for ( a=0; a< (W->nseq); a++)
5361 fprintf ( fp, "\t%*s %.2f\n", max_len,(W)->seq_name[a],W->SEQ_W[a]);
5363 fprintf ( fp, "\n");
5367 /*******************************************************************************************/
5372 /***************************************************************************************** */
/*
 * input_similarities: load a pairwise similarity matrix from `file`.
 *
 * The "# SEQ_INDEX <name> <index>" lines are compared with A->name, and an
 * alignment is built from the file itself with similarities_file2aln().
 * The result is an nseq x nseq int matrix with 100 on the diagonal; the
 * lines containing "BOT\t" or "TOP\t" that follow the "PW_SEQ_DISTANCES"
 * token supply sim[a][b]=sim[b][a]=(int)score.
 *
 * NOTE(review): the first test rejects the file unless it passes BOTH
 * is_distance_matrix_file and is_similarity_matrix_file; confirm that "and"
 * rather than "either" is intended.
 * NOTE(review): the lines are read with buf2=vfgets(buf1,fp) but parsed
 * from buf1, which is initialised to NULL; confirm how vfgets treats its
 * first argument.
 * NOTE(review): i, a and b come from the file and are not range-checked.
 */
5373 int ** input_similarities (char *file, Alignment *A, char *mode)
5380 char *buf1=NULL, *buf2=NULL;
5385 if ( !check_file_exists (file) || !is_distance_matrix_file (file) ||!is_similarity_matrix_file (file) )
5392 fp=vfopen (file, "r");
5393 while ((buf2=vfgets (buf1,fp))!=NULL )
5395 if (strstr (buf2, "SEQ_INDEX"))
5398 sscanf (buf1, "# SEQ_INDEX %s %d",name, &i);
5399 if ( !strm (A->name[i], name))
5410 A=similarities_file2aln(file);
5414 sim=declare_int ( A->nseq, A->nseq);
5415 for ( a=0; a<A->nseq; a++)sim[a][a]=100;
5418 fp=find_token_in_file (file, NULL, "PW_SEQ_DISTANCES");
5419 fp=find_token_in_file (file, fp, "BOT");
5420 while ((buf2=vfgets (buf1,fp))!=NULL )
5422 if ( !(strstr (buf2, "BOT\t") || strstr (buf2, "TOP\t")))continue;
5424 n=sscanf (buf1, "%*s %d %d %f", &a, &b, &score);
5430 else sim[a][b]=sim[b][a]=(int)score;
5434 if (new_aln)free_aln(A);
/*
 * similarities_file2aln: build a name-only alignment from a similarity
 * file.
 * The number of "SEQ_INDEX" tokens gives the sequence count; an alignment
 * of nseq+1 rows is declared and every "<name> <index>" pair following a
 * token is stored as A->name[index].
 * NOTE(review): the index read from the file is not range-checked and the
 * %s conversion into name is unbounded.
 */
5438 Alignment * similarities_file2aln ( char *file)
5446 fp=vfopen (file, "r");
5447 while ((fp=find_token_in_file (file,fp, "SEQ_INDEX")))nseq++;
5448 A=declare_aln2 (nseq+1, 10);
5450 while ((fp=find_token_in_file (file,fp, "SEQ_INDEX")))
5452 fscanf (fp, "%s %d", name,&i);
5453 sprintf ( A->name[i], "%s", name);
/*
 * output_similarities: write the pairwise similarities of A to `file` in
 * the "TC_SIMILARITY_MATRIX_FORMAT_01" layout.
 *
 * Layout: a header line, one "# SEQ_INDEX <name> <index>" line per
 * sequence, "# PW_SEQ_DISTANCES", then for each pair a<b a "BOT" and a
 * "TOP" line carrying the same score, one "AVG" line per sequence and a
 * final "TOT" line.
 *
 * The score of a pair depends on substrings of `mode`, tested in this
 * order:
 *   "_sarmat2", "_sar"        -> get_sar_sim on the aligned rows
 *   "_memory_<value>"         -> the number after the prefix is stored in
 *                                sim through a long int pointer cast
 *   "_idscore" / "_covscore"  -> idscore_pairseq ("sim" or "cov") with the
 *                                "idmat" matrix when mode contains
 *                                "idscoreDNA", "blosum62mt" otherwise
 *   "cov"                     -> get_seq_sim(..., "cov")
 *   otherwise                 -> get_seq_fsim2(..., mode)
 *
 * NOTE(review): the averages divide by A->nseq-1 and by n, which are 0 for
 * a single sequence.
 */
5460 void output_similarities (char *file, Alignment *A, char *mode)
5470 for (max=0, a=0; a< A->nseq; a++)max=MAX(max,(strlen (A->name[a])));
5473 tot=vcalloc ( A->nseq, sizeof (float));
5474 fp=vfopen (file, "w");
5475 fprintf (fp, "# TC_SIMILARITY_MATRIX_FORMAT_01\n");
5476 for ( a=0; a<A->nseq; a++)
5477 fprintf ( fp, "# SEQ_INDEX %s %d\n",A->name[a],a);
5478 fprintf ( fp, "# PW_SEQ_DISTANCES \n");
5479 for (n=0,a=0;a< A->nseq-1; a++)
5481 for ( b=a+1; b<A->nseq; b++, n++)
5483 if (strstr (mode, "_sarmat2"))
5485 s=get_sar_sim (A->seq_al[a], A->seq_al[b]);
5487 else if (strstr (mode, "_sar"))
5489 s=get_sar_sim (A->seq_al[a], A->seq_al[b]);
5491 else if ( (p=strstr (mode, "_memory_")))
5494 sscanf ( p, "_memory_%ld", (long int*)&sim);
5497 else if ( strstr (mode, "_idscore") || strstr ( mode, "_covscore"))
5502 free_sequence (S, -1);
5503 if ( strstr (mode, "idscoreDNA"))
5504 M=read_matrice ("idmat");
5506 M=read_matrice("blosum62mt");
5510 if ( strstr (mode, "_idscore"))s=idscore_pairseq(S->seq[a], S->seq[b], -10,-1, M, "sim");
5511 else s=idscore_pairseq(S->seq[a], S->seq[b], -10,-1, M, "cov");
5513 else if ( strstr (mode, "cov"))
5515 s=get_seq_sim ( A->seq_al[a], A->seq_al[b],GAP_LIST, "cov");
5519 s=get_seq_fsim2 (A->seq_al[a], A->seq_al[b],GAP_LIST, mode);
5521 fprintf (fp, "BOT\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", a,b,s,max,A->name[a], max, A->name[b], s);
5522 fprintf (fp, "TOP\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", b,a,s,max,A->name[b], max, A->name[a], s);
5528 for ( a=0; a< A->nseq; a++)
5530 fprintf (fp, "AVG\t %d\t %*s\t %*s\t %5.2f\n", a,max,A->name[a], max, "*", tot[a]/(A->nseq-1));
5533 vfree (tot);free_int (M, -1);
5534 fprintf (fp, "TOT\t %*s\t %*s\t %5.2f\n", max,"TOT", max, "*", bigtot/n);
/*
 * output_similarities_pw: write the similarities between every sequence of
 * A and every sequence of B to `file`, in the same
 * "TC_SIMILARITY_MATRIX_FORMAT_01" layout as output_similarities().
 * Scores come from idscore_pairseq(..., "sim") with the "idmat" matrix when
 * mode contains "idscoreDNA" and "blosum62mt" otherwise.
 * Only the sequences of A are listed in the SEQ_INDEX header.
 * NOTE(review): tot holds A->nseq entries and the AVG lines divide by
 * A->nseq-1 although each row of A is compared with B->nseq sequences;
 * confirm the intended denominator.
 */
5538 void output_similarities_pw (char *file, Alignment *A, Alignment *B,char *mode)
5550 if ( strstr (mode, "idscoreDNA"))
5551 M=read_matrice ("idmat");
5553 M=read_matrice("blosum62mt");
5558 for (max=0, a=0; a< A->nseq; a++)max=MAX(max,(strlen (A->name[a])));
5559 for (a=0; a< B->nseq; a++)max=MAX(max,(strlen (B->name[a])));
5562 tot=vcalloc ( A->nseq, sizeof (float));
5563 fp=vfopen (file, "w");
5564 fprintf (fp, "# TC_SIMILARITY_MATRIX_FORMAT_01\n");
5565 for ( a=0; a<A->nseq; a++)
5566 fprintf ( fp, "# SEQ_INDEX %s %d\n",A->name[a],a);
5567 fprintf ( fp, "# PW_SEQ_DISTANCES \n");
5568 for (n=0,a=0;a< A->nseq; a++)
5570 for ( b=0; b<B->nseq; b++, n++)
5572 s=idscore_pairseq(SA->seq[a], SB->seq[b], -10,-1, M, "sim");
5573 fprintf (fp, "BOT\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", a,b,s,max,A->name[a], max, B->name[b], s);
5574 fprintf (fp, "TOP\t %4d %4d\t %5.2f %*s\t %*s\t %5.2f\n", b,a,s,max,B->name[b], max, A->name[a], s);
5581 for ( a=0; a< A->nseq; a++)
5583 fprintf (fp, "AVG\t %d\t %*s\t %*s\t %5.2f\n", a,max,A->name[a], max, "*", tot[a]/(A->nseq-1));
5585 vfree (tot);free_int (M, -1);
5586 fprintf (fp, "TOT\t %*s\t %*s\t %5.2f\n", max,"TOT", max, "*", bigtot/n);
/*
 * output_conservation_statistics: derive a substitution matrix from the
 * sequence pairs of A and print it on stdout in BLAST matrix format.
 *
 * The alignment is walked two sequences at a time (a+=2); columns where
 * neither residue is a gap contribute to the 256x256 table `tot`, indexed
 * by lower-cased residues.  For each pair of letters of
 * BLAST_AA_ALPHABET the printed value is (int)(10*log(r)) with
 *   r = f1/(f2*f3),  f1 = 2*tot[c1][c2]/tot[0][0],
 *   f2 = tot[c1][0]/tot[0][0],  f3 = tot[c2][0]/tot[0][0],
 * and 0 when f2 or f3 is 0.
 *
 * NOTE(review): all output goes to stdout; `file` is not used by the
 * statements shown here.
 * NOTE(review): tot[0][0] is used as a divisor without a zero test.
 */
5589 void output_conservation_statistics ( char *file, Alignment *A)
5596 sprintf (aa, "%s", BLAST_AA_ALPHABET);
5599 tot=declare_double (256, 256);
5602 for ( a=0; a<A->nseq; a+=2)
5605 for ( c=0; c<A->len_aln; c++)
5607 c1=tolower (A->seq_al[a][c]);
5608 c2=tolower (A->seq_al[b][c]);
5609 if ( !is_gap(c1) && !is_gap(c2))
5620 fprintf ( stdout, "# BLAST_MATRIX FORMAT\n#ALPHABET=%s\n",aa);
5621 for (a=0; a<naa; a++)fprintf ( stdout, "%3c ", toupper(aa[a]));
5622 fprintf ( stdout, "\n");
5623 for (a=0; a< naa; a++)
5625 fprintf (stdout, "%c", toupper(aa[a]));
5626 for ( b=0; b< naa; b++)
5628 float f1, f2, f3, r, v;
5629 c1=tolower(aa[a]);c2=tolower(aa[b]);
5630 f1=(float)((tot[c1][c2]*2)/tot[0][0]);
5631 f2=(float)((tot[c1][0])/tot[0][0]);
5632 f3=(float)((tot[c2][0])/tot[0][0]);
/* The (float) cast applies to the parenthesised test only */
5633 r=(float)(f2==0 || f3==0)?0:(f1/(f2*f3));
5634 v=(r==0)?0:((float)10*log((double)r));
5635 fprintf (stdout, " %5d",(int)v);
5637 fprintf ( stdout, "\n");
/*
 * output_statistics: write per-sequence and global statistics of A to
 * `file`, driven by the letters of `mode`.
 *
 * Header letters (lower case) add a column: 'n' name, 'l' number of
 * residues ("nres"), 'g' number of gaps ("ngap"), 't' length ("len").
 * Global letters (upper case): 'N' prints "NSEQ <nseq>", 'L' prints
 * "LEN <len_aln>".
 * For each sequence the selected columns are printed in the order given
 * by mode: the residue count is the number of non-gap positions, the gap
 * count is the number of gap runs (a gap followed by a non-gap or by the
 * end of the string), the length is strlen of the aligned row.
 */
5640 void output_statistics (char *file, Alignment *A, char *mode)
5643 int a, b, c, d=0, n;
5647 if (!mode || !mode[0])
5649 else if ( mode[0]=='_')
5651 for ( a=0; a<A->nseq; a++)maxname=MAX(strlen(A->name[a]), maxname);
5655 fp=vfopen (file, "w");
5660 while ((c=mode[b++])!='\0')
5662 if ( c=='n') fprintf (fp, "%-*s ",maxname,"name");
5663 if ( c=='l') fprintf (fp, "%-*s ",5,"nres");
5664 if ( c=='g') fprintf (fp, "%-*s ",5,"ngap");
5665 if ( c=='t') fprintf (fp, "%-*s ",5,"len");
5667 if (is_in_set ( c, "nlgt")) fprintf (fp, "\n");
5671 while ((c=mode[b++])!='\0')
5674 if ( c=='N'){d=1;fprintf (fp, "NSEQ %d ", A->nseq);}
5675 if ( c=='L'){d=1;fprintf (fp, "LEN %d ", A->len_aln);}
5677 if ( d) fprintf (fp, "\n");
5679 for (a=0; a<A->nseq; a++)
5683 while ((c=mode[b++])!='\0')
5685 if (is_in_set ( c, "nlgt"))d=1;
5687 if (c=='n'){d=1;fprintf ( fp, "%-*s ", maxname,A->name[a]);}
5690 for (n=0,d=0; d<A->len_aln; d++)n+=!is_gap(A->seq_al[a][d]);
5691 fprintf ( fp, "%-5d ",n);
/* Counts gap runs: a gap position whose successor is a residue or the end */
5695 for (n=0,d=0; d<A->len_aln; d++)n+=((is_gap(A->seq_al[a][d]) && !is_gap(A->seq_al[a][d+1]))||(is_gap(A->seq_al[a][d])&& A->seq_al[a][d+1]=='\0')) ;
5696 fprintf ( fp, "%-5d ",n);
5700 fprintf ( fp, "%-5d ",(int)strlen (A->seq_al[a]));
5704 fprintf ( fp, "%-5d ",A->nseq);
5708 fprintf ( fp, "%-5d ",A->len_aln);
5711 if (d)fprintf ( fp, "\n");
/*
 * output_age_matrix: write a distance-based scoring matrix over the
 * alphabet "abcdefghij-" to `outfile` with output_mat().
 * With w = (val==0) ? 1 : val:
 *   - any pair involving the gap symbol scores -w;
 *   - any other pair scores -|a-b| * w, where a and b are the positions of
 *     the two symbols in the alphabet.
 */
5716 int output_age_matrix ( char *outfile, int val)
5720 char alp[]="abcdefghij-";
5723 mat=declare_int ( 256, 256);
5725 for ( a=0; a<naa; a++)
5726 for ( b=0; b<naa; b++)
5728 if (is_gap(alp[a]) ||is_gap(alp[b] ))mat[(int)alp[a]][(int)alp[b]]=((val==0)?1:val)*-1;
5729 else mat[(int)alp[a]][(int)alp[b]]=(FABS((a-b))*-1)*((val==0)?1:val);
5732 output_mat ( mat,outfile, alp, 0);
5733 free_arrayN((void**)mat, 2);
/*
 * output_transitions: compute log-odds scores for the transitions between
 * consecutive symbols of the sequences of A and write them to `outfile`.
 *
 * Every sequence is ungapped and lower-cased in place.  symbols[x] counts
 * the occurrences of x and table[x][y] counts the occurrences of x
 * immediately followed by y.  For a pair with non-zero counts the score is
 *   log( (table[x][y]/tot) / (symbols[x]*symbols[y]/(tot*tot)) ),
 * stored as a float in fmat and, multiplied by 10, as an int in mat; pairs
 * of observed symbols with no usable count get -999.
 * The int matrix is written with output_mat(); per-symbol and per-pair
 * lines starting with '#' are then appended to the same file.
 *
 * NOTE(review): mat and fmat are indexed with a-'A' and b-'A' while a and b
 * range over 0..255, so any observed symbol below 'A' gives a negative
 * index.
 * NOTE(review): the sequences of A are modified by ungap/lower_string.
 */
5740 int output_transitions(char *outfile, Alignment *A)
5742 double table[256][256];
5743 double symbols[256];
5744 double tot, l, freq, expected, log_odd;
5754 for ( a=0; a< 256; a++)
5755 for (b=0; b<256; b++)
5760 alp=vcalloc ( 256, sizeof (char));
5761 mat=declare_int ( 256,256);
5762 fmat=declare_float ( 256,256);
5764 for (tot=0,a=0; a< A->nseq; a++)
5766 ungap (A->seq_al[a]);
5767 lower_string (A->seq_al[a]);
5770 if ( s[0]=='\0') continue;
5771 symbols[(int)s[0]]++;
5772 for ( b=1; b< l; b++)
5774 symbols[(int)s[b]]++;
5775 table[(int)s[b-1]][(int)s[b]]++;
/* alp collects the symbols that were seen at least once */
5779 for (naa=0, a=0; a< 256; a++)
5781 if (symbols[a])alp[naa++]=a;
5785 for ( a=0; a< 256; a++)
5786 for (b=0; b<256; b++)
5788 if (symbols[a]&& symbols[b] && table[a][b] && tot>0)
5790 freq=(table[a][b])/tot;
5791 expected=(symbols[a]*symbols[b])/(tot*tot);
5792 log_odd=log (freq/expected);
5793 mat[a-'A'][b-'A']=log_odd*10;
5794 fmat[a-'A'][b-'A']=log_odd;
5796 else if ( symbols[a]&& symbols[b])
5798 mat[a-'A'][b-'A']=-999;
5799 fmat[a-'A'][b-'A']=-999;
5802 output_mat ( mat,outfile, alp, 'A');
5804 fp=vfopen (outfile, "a");
5805 for ( a=0; a<256; a++)
5808 fprintf (fp, "# %c tot: %6d freq: %7.5f\n", a, (int)symbols[a],(float)symbols[a]/tot);
5811 for ( a=0; a< 256; a++)
5812 for (b=0; b<256; b++)
5814 if (symbols[a]&& symbols[b])
5816 freq=(table[a][b])/tot;
5817 fprintf (fp, "# %c%c tot: %6d freq: %7.5f log_odd: %9.3f\n", a, b, (int)table[a][b],(float)freq,fmat[a-'A'][b-'A']);
5822 free_arrayN ((void **)mat, 2);
5823 free_arrayN ((void **)fmat, 2);
/*
 * output_est_prf: write the profile of A to `fname`.
 * Exits with an error message when no profile is available.  After a
 * header, one line is written per column with (A->P)->count[0..3][col]
 * under the labels A, G, C, T, count[4][col] under "Internal Gaps", and
 * the character of A->seq_al[0] at that column under "Consensus".
 */
5830 void output_est_prf (char *fname, Alignment *A)
5837 fprintf ( stderr, "\nFormat output_est_prf Impossible: No profile\n");
5838 myexit(EXIT_FAILURE);
5842 fp=vfopen ( fname, "w");
5843 fprintf ( fp, "Consensus Sequence\nReconstructed with %s (%s,%s)\n",PROGRAM,AUTHOR,DATE);
5844 fprintf ( fp, "%4c %4c %4c %4c %15s Consensus\n", 'A','G','C','T', "Internal Gaps");
5846 for ( a=0; a< A->len_aln; a++)
5848 fprintf (fp, "%4d %4d %4d %4d %15d %c\n", (A->P)->count[0][a],(A->P)->count[1][a],(A->P)->count[2][a], (A->P)->count[3][a], (A->P)->count[4][a],A->seq_al[0][a]);
/*
 * output_gotoh_seq: write the sequences of A to `fname` in Gotoh format.
 * First line: "<nseq> <max_len>".  Each sequence is ungapped in place and
 * written as ">name", the residues wrapped by output_string_wrap(50, ...),
 * and a closing "//" line.
 */
5854 void output_gotoh_seq (char *fname, Alignment*A )
5859 fp=vfopen ( fname, "w");
5860 fprintf ( fp, "%d %d\n",A->nseq, A->max_len);
5861 for ( a=0; a< A->nseq; a++)
5863 ungap ( A->seq_al[a]);
5864 fprintf ( fp, ">%s\n", A->name[a]);
5865 fp=output_string_wrap ( 50,A->seq_al[a] , fp);
5866 fprintf ( fp, "//\n");
/*
 * output_mult_fasta_seq: write the first sequence of A to fname in FASTA
 * form, with the record name suffixed "_<a+1>".
 * The loop that drives `a` is not visible in this excerpt; presumably it
 * repeats the record n times -- TODO confirm.
 */
5872 void output_mult_fasta_seq (char *fname, Alignment*A, int n )
5877 fp=vfopen (fname, "w");
5878 ungap(A->seq_al[0]);
5881 fprintf (fp, ">%s_%d\n%s\n", A->name[0],a+1, A->seq_al[0]);
/*
 * output_wexons: write, for each sequence of A, a ">name" line followed by
 * "<residue> <value>" pairs taken from the table w.
 * Returns 0 when A or w is NULL; the success return value and the origin of
 * w and r are not visible in this excerpt.
 * NOTE(review): the file is opened before the NULL checks, so an empty file
 * is created even on the early-return paths.
 */
5885 int output_wexons (char *name, Alignment *A)
5890 fp=vfopen (name, "w");
5892 if (!A) {vfclose(fp);return 0;}
5894 if (!w) {vfclose (fp);return 0; }
5896 for (a=0; a<A->nseq; a++)
5898 fprintf (fp, ">%s\n", A->name[a]);
5899 for (c=0,b=0; b<A->len_aln; b++)
/* Non-gap residues other than 'f'/'F' consume the next entry of w[a]. */
5903 if (!is_gap(r) && r!='f' && r!='F')
5905 fprintf (fp, " %c %d ", r,w[a][c++]);
/* Remaining non-gap residues (i.e. 'f'/'F') are written with -1. */
5907 else if (!is_gap(r))fprintf (fp,"%c -1 ",r);
/*
 * output_fasta_seqX: write record i of either Sequence S or Alignment A to
 * `name` as FASTA (">name comment" then the sequence), opening the file with
 * the caller-supplied mode. A temporary name from vtmpnam(NULL) is used when
 * name is NULL. An invalid index, or both S and A being NULL, is fatal.
 * The branch structure between the two fprintf calls and the return
 * statement are not visible in this excerpt.
 * NOTE(review): the middle test `(A && S->nseq<=i)` dereferences S even when
 * only A was supplied (S==NULL); it looks like `A->nseq<=i` was intended.
 */
5913 char * output_fasta_seqX (char *name, char *mode, Sequence *S, Alignment *A, int i)
5917 if (!name)name=vtmpnam (NULL);
5918 fp=vfopen (name, mode);
5919 if ( (S && S->nseq<=i) || (A && S->nseq<=i) || (!A && !S))
5921 fprintf ( stderr, "\nERROR in function reformat:output_fasta_seqX[FATAL:%s]", PROGRAM);
5922 myexit (EXIT_FAILURE);
5926 fprintf ( fp, ">%s %s\n%s\n", S->name[i], S->seq_comment[i], S->seq[i]);
5929 ungap (A->seq_al[i]);
5930 fprintf ( fp, ">%s %s\n%s\n", A->name[i], A->seq_comment[i], A->seq_al[i]);
/*
 * output_fasta_seq1: write every sequence of A to its own FASTA file.
 * The file name is "<seqname>.<ext>": when fname starts with "name", any
 * text after the fifth character of fname is used as the extension
 * (default "seq"); the other visible branch always uses ".seq".
 * NOTE(review): seq_name is filled with unbounded sprintf; a very long
 * sequence name or extension could overflow VERY_LONG_STRING bytes.
 */
5936 void output_fasta_seq1 (char *fname, Alignment*A )
5938 char seq_name[VERY_LONG_STRING];
5943 for ( a=0; a< A->nseq; a++)
5945 if ( strncmp( fname, "name",4)==0)
/* fname+5 skips "name" plus one separator character. */
5947 if ( (fname+4)[0]!='\0')extension=fname+5;
5951 sprintf ( seq_name,"%s.%s", A->name[a],(extension==NULL)?"seq":extension);
5954 sprintf ( seq_name,"%s.seq",A->name[a]);
5956 ungap ( A->seq_al[a]);
5957 fp=vfopen (seq_name, "w");
5958 fprintf (fp, ">%s %s\n", A->name[a], A->seq_comment[a]);
5959 fp=output_string_wrap ( 50, A->seq_al[a],fp);
5960 fprintf ( fp, "\n");
/*
 * output_pir_check: write the nseq strings of `comment` to fname, one per
 * line. Does nothing when fname is NULL.
 */
5964 void output_pir_check (char *fname,int nseq, char **comment )
5969 if ( fname==NULL)return;
5970 fp=vfopen ( fname, "w");
5972 for ( a=0; a< nseq; a++)fprintf (fp, "%s\n", comment[a]);
/*
 * output_fasta_seqS: convert Sequence S to an Alignment with
 * seq2aln(S, NULL, RM_GAP) and write it with output_fasta_seq.
 * Whether the temporary alignment is released afterwards is not visible in
 * this excerpt.
 */
5975 void output_fasta_seqS (char *fname, Sequence *S)
5978 A=seq2aln (S,NULL,RM_GAP);
5979 output_fasta_seq (fname, A);
5983 void output_fasta_seq (char *fname, Alignment*A)
5985 main_output_fasta_seq (fname, A, HEADER);
/*
 * output_fasta_tree: write each entry of A to fname as
 * ">name comment" followed by A->seq_al[a] on a single line (no wrapping,
 * no ungap call is visible). Returns immediately when A is NULL or has no
 * sequences.
 */
5987 void output_fasta_tree (char *fname, Alignment*A)
5991 if ( !A || !A->nseq) return;
5993 fp=vfopen ( fname, "w");
5995 for ( a=0; a< A->nseq; a++)
5997 fprintf ( fp, ">%s %s\n%s\n", A->name[a], A->seq_comment[a], A->seq_al[a]);
/*
 * main_output_fasta_seq: write every sequence of A to fname in FASTA form.
 * Each sequence is passed to ungap() first, then written as ">name", with
 * the comment appended only when header==HEADER and the comment is neither
 * empty nor blank (isblanc). The residues go through output_string_wrap
 * with width 50.
 */
6001 void main_output_fasta_seq (char *fname, Alignment*A,int header )
6006 fp=vfopen ( fname, "w");
6008 for ( a=0; a< A->nseq; a++)
6010 ungap(A->seq_al[a]);
6011 fprintf ( fp, ">%s", A->name[a]);
6012 if (header==HEADER && A->seq_comment[a][0] && !isblanc(A->seq_comment[a]))fprintf (fp," %s\n",A->seq_comment[a]);
6013 else fprintf ( fp, "\n");
6014 fp=output_string_wrap ( 50, A->seq_al[a],fp);
6015 fprintf ( fp, "\n");
/*
 * output_gor_seq: write the sequences of A to fname as "!name length"
 * records, each followed by the upper-cased sequence (wrapped via
 * output_string_wrap, width 50) and a terminating "@" line.
 * ungap() and upper_string() are applied to A->seq_al[a] directly.
 */
6019 void output_gor_seq (char *fname, Alignment*A )
6024 fp=vfopen ( fname, "w");
6026 for ( a=0; a< A->nseq; a++)
6028 ungap(A->seq_al[a]);
6029 fprintf ( fp, "!%s %d \n", A->name[a], (int)strlen(A->seq_al[a]));
6030 upper_string ( A->seq_al[a]);
6031 fp=output_string_wrap ( 50, A->seq_al[a],fp);
6032 fprintf ( fp, "@\n");
6036 void output_pir_seq (char *fname, Alignment*A )
6039 for ( a=0; a< A->nseq; a++)ungap(A->seq_al[a]);
6040 output_pir_aln (fname, A);
/*
 * output_pir_seq1: write each sequence of A to its own file named
 * "<type>;<fname>_<seqname>.seq", where type is "DL" for DNA/RNA and "P1"
 * for PROTEIN as reported by get_string_type. Each file holds ">name", an
 * empty line, the ungapped sequence (wrapped, width 50) and a "*" line.
 * NOTE(review): no fallback is visible for other string types, so `type`
 * may keep its previous contents in that case -- confirm.
 * NOTE(review): seq_name is filled with unbounded sprintf.
 */
6042 void output_pir_seq1 (char *fname, Alignment*A )
6044 char seq_name[VERY_LONG_STRING];
6050 for ( a=0; a< A->nseq; a++)
6052 if ( strm ( get_string_type (A->seq_al[a]),"DNA") || strm ( get_string_type (A->seq_al[a]),"RNA"))sprintf(type, "DL");
6053 else if ( strm ( get_string_type (A->seq_al[a]),"PROTEIN"))sprintf(type, "P1");
6054 sprintf ( seq_name,"%s;%s_%s.seq",type, fname,A->name[a]);
6055 ungap ( A->seq_al[a]);
6056 fp=vfopen (seq_name, "w");
6057 fprintf (fp, ">%s\n\n", A->name[a]);
6058 fp=output_string_wrap ( 50, A->seq_al[a],fp);
6059 fprintf ( fp, "\n*\n");
6063 /*******************************************************************************************/
6068 /***************************************************************************************** */
/*
 * output_mocca_aln: reorder alignment A by the per-sequence scores held in
 * S->score_seq and write it to outfile with a "MOCCA" header, a
 * "name: score" list and the alignment body
 * (output_Alignment_without_header).
 * score[a][1] holds the score; score[a][0] is used as the original index
 * (its assignment is not visible in this excerpt).
 * NOTE(review): score and new_name_order are sized with S->nseq but indexed
 * up to A->nseq; this assumes A->nseq <= S->nseq -- confirm.
 */
6069 void output_mocca_aln (char *outfile, Alignment *A, Alignment *S)
6073 char **new_name_order;
6076 score=declare_int (S->nseq, 2);
6077 new_name_order=declare_char ( S->nseq,MAXNAMES+1);
6078 for ( a=0; a<A->nseq; a++)
6081 score[a][1]=S->score_seq[a];
/* Sorting starts at score+1, so row 0 is left in first position. */
6083 sort_int_inv (score+1,2,1,0,S->nseq-2);
6084 for ( a=0; a<A->nseq; a++)
6086 sprintf ( new_name_order[a], "%s", A->name[score[a][0]]);
6088 A=reorder_aln (A, new_name_order, A->nseq);
6090 fp=vfopen (outfile, "w");
6091 fprintf ( fp, "MOCCA,(%s,%s, C. Notredame)\nSCORE %d\nNSEQ %d\nLEN %d\n",VERSION,DATE, A->score_aln, A->nseq, A->len_aln);
6093 maxl=return_maxlen ( new_name_order, A->nseq);
6096 for (a=0; a< A->nseq; a++)
6098 fprintf (fp, "%-*s: %3d\n", maxl, A->name[a], score[a][1]);
6101 fprintf ( fp, "\n");
6103 fp=output_Alignment_without_header ( A, fp);
6105 free_int (score, -1);
6106 free_char (new_name_order, -1);
/*
 * print_sub_aln: build a copy X of B that contains only the sequences
 * listed in ls[0][0..ns[0]-1] and ls[1][0..ns[1]-1].
 * X->len_aln is taken from the length of the first listed sequence, and
 * the name slot after the last copied sequence is set to the empty string.
 * How X is printed or released is not visible in this excerpt.
 */
6110 void print_sub_aln ( Alignment *B, int *ns, int **ls)
6116 X=copy_aln (B, NULL);
6118 X->len_aln=strlen ( B->seq_al[ls[0][0]]);
6121 for (a=0; a< 2; a++)
6122 for ( b=0; b<ns[a]; b++, X->nseq++)
6124 sprintf ( X->seq_al[X->nseq], "%s", B->seq_al[ls[a][b]]);
6125 sprintf ( X->name[X->nseq], "%s", B->name[ls[a][b]]);
6127 X->name[X->nseq][0]='\0';
/*
 * print_aln: debugging helper that dumps alignment B to stderr through
 * output_Alignment_without_header. Any surrounding loop or guard is not
 * visible in this excerpt.
 */
6132 void print_aln ( Alignment *B)
6137 output_Alignment_without_header ( B, stderr);
6143 FILE * output_aln ( Alignment *B, FILE *fp){return output_Alignment(B, fp);}
6144 FILE * output_Alignment ( Alignment *B, FILE *fp)
6146 fprintf ( fp, "%s, %s (%s) [%s] [MODE: %s]\n%s\nCPU %d sec\nSCORE %d\nNSEQ %d\nLEN %d\n",PROGRAM,VERSION,DATE,retrieve_mode(),URL,AUTHOR, (B->cpu+get_time())/1000, B->score_aln, B->nseq, B->len_aln);
6148 return output_Alignment_without_header ( B, fp);
/*
 * output_Alignment_without_header: print alignment B to fp in interleaved
 * blocks of `line` columns (line comes from get_msa_line_length(0, 0)).
 * Returns fp unchanged when it is NULL.
 * The name column is padded to max(longest name + 2, 16). The row loop
 * runs to b==B->nseq inclusive: that extra row prints, for each column, the
 * character returned by analyse_aln_column. When B->output_res_num is set,
 * residue counters are printed before and after each row; they start at
 * B->order[a][1] when output_res_num==2 and at 0 otherwise.
 * The declaration of max_len and the function's final return are not
 * visible in this excerpt.
 */
6151 FILE * output_Alignment_without_header ( Alignment *B, FILE *fp)
6160 if (fp==NULL)return fp;
6161 for ( a=0; a< B->nseq; a++)
6162 {if ( strlen (B->name[a])>max_len)
6163 max_len= strlen ( (B->name[a]));
6165 max_len=MAX(max_len+2, 16);
6166 line=get_msa_line_length (0, 0);
/* One extra slot for the consensus row (index B->nseq). */
6167 n_residues=vcalloc ( B->nseq+1, sizeof (int));
6168 for ( a=0; a<B->nseq; a++)n_residues[a]=(B->output_res_num==2)?B->order[a][1]:0;
6173 fprintf ( fp, "\n");
6174 for (a=0; a<B->len_aln; a+=line)
6175 {for (b=0; b<=B->nseq; b++)
6177 fprintf (fp,"%-*s",max_len,B->name[b]);
6178 if (B->output_res_num)fprintf (fp, " %4d ", n_residues[b]+1);
6179 for (c=a;c<a+line && c<B->len_aln;c++)
6181 if (b==B->nseq){n_residues[b]++;s=analyse_aln_column ( B, c);}
/* Ordinary rows: count non-gap residues and apply the case policy. */
6183 {n_residues[b]+=!is_gap(B->seq_al[b][c]);
6184 s=GET_CASE(B->residue_case, B->seq_al[b][c]);
6187 fprintf (fp,"%c",s );
6189 if (B->output_res_num)fprintf (fp, " %4d", n_residues[b]);
6196 fprintf (fp,"\n\n");
6201 FILE * output_aln_score ( Alignment *B, FILE *fp){return output_Alignment_score(B, fp);}
/*
 * output_Alignment_score: print a score view of alignment B to fp.
 * Returns fp unchanged when it is NULL. Writes a header with CPU time and
 * B->score_aln, one "name: score" line per sequence, then interleaved
 * blocks in which each position is rendered from a value `ch`:
 *   NO_COLOR_RESIDUE -> '-', NO_COLOR_GAP -> '*', 0..9 -> the digit,
 *   >10 -> '#', <0 -> '.', anything else (i.e. exactly 10) -> '9'.
 * The same mapping is applied to the row printed after the sequence rows.
 * How `ch` is obtained is not visible in this excerpt.
 * max_len is static, so the widest name seen persists across calls.
 * Side effect: B->name[B->nseq] is overwritten with "CONS".
 */
6202 FILE * output_Alignment_score ( Alignment *B, FILE *fp)
6205 static int max_len=0;
6209 if (fp==NULL)return fp;
6212 for ( a=0; a< B->nseq; a++)
6213 {if ( strlen (B->name[a])>max_len)
6214 max_len= strlen ( (B->name[a]));
6219 line=get_msa_line_length(0, 0);
6220 sprintf (B->name[B->nseq], "CONS");
6221 fprintf ( fp, "T_COFFEE ALIGNMENT\nCPU TIME:%d sec.\n", (B->cpu+get_time())/1000);
6222 fprintf ( fp, "SCORE=%d\n", B->score_aln);
6223 for ( a=0;a<B->nseq; a++)fprintf ( fp, "%s: %d\n", B->name[a], B->score_seq[a]);
6224 fprintf ( fp, "\n");
6225 for (a=0; a<B->len_aln; a+=line)
6226 {for (b=0; b<B->nseq; b++)
6228 fprintf (fp,"%-*s",max_len,B->name[b]);
6229 for (c=a;c<a+line && c<B->len_aln;c++)
6232 if (ch==NO_COLOR_RESIDUE)fprintf (fp,"-");
6233 else if ( ch==NO_COLOR_GAP)fprintf (fp,"*");
6234 else if ( ch<10 && ch>=0)fprintf (fp,"%d",ch);
6235 else if ( ch>10)fprintf (fp,"#");
6236 else if ( ch<0)fprintf (fp,".");
6237 else fprintf (fp,"9");
/* Row printed after the per-sequence rows; b keeps its post-loop value. */
6242 fprintf (fp,"%-*s",max_len,B->name[b]);
6243 for (c=a;c<a+line && c<B->len_aln;c++)
6246 if (ch==NO_COLOR_RESIDUE)fprintf (fp,"-");
6247 else if ( ch==NO_COLOR_GAP)fprintf ( fp, "*");
6248 else if ( ch<10 && ch>=0)fprintf (fp,"%d",ch);
6249 else if ( ch>10)fprintf (fp,"#");
6250 else if ( ch<0)fprintf (fp,".");
6251 else fprintf (fp,"9");
6253 fprintf (fp,"\n\n\n");
6255 fprintf (fp,"\n\n");
6258 FILE * output_aln_with_res_number ( Alignment *B, FILE *fp){return output_Alignment_with_res_number(B, fp);}
/*
 * output_Alignment_with_res_number: print alignment B to fp in interleaved
 * blocks, each row prefixed with "name order[b][0] order[b][1]" and
 * followed by the running residue count after the block.
 * Returns fp unchanged when it is NULL. Works on a private copy of
 * B->order, which is advanced by one for every non-gap character printed
 * and freed at the end. Residues are printed upper-cased.
 * max_len is static and persists across calls. The assignment of `line`
 * and the final return are not visible in this excerpt.
 */
6259 FILE * output_Alignment_with_res_number ( Alignment *B, FILE *fp)
6262 static int max_len=0;
6266 if (fp==NULL)return fp;
6269 for ( a=0; a< B->nseq; a++)
6270 {if ( strlen (B->name[a])>max_len)
6271 max_len= strlen ( (B->name[a]));
6276 order=copy_int ( B->order,declare_int ( B->nseq, 2), B->nseq, 2);
6278 fprintf ( fp, "T_COFFEE ALIGNMENT\nCPU TIME:%d sec.\n", (B->cpu+get_time())/1000);
6279 fprintf ( fp, "\n");
6280 for (a=0; a<B->len_aln; a+=line)
6281 {for (b=0; b<B->nseq; b++)
6283 fprintf (fp,"%-*s %3d %4d ",max_len,B->name[b], order[b][0], order[b][1] );
6284 for (c=a;c<a+line && c<B->len_aln;c++)
6286 order[b][1]+=1-is_gap(B->seq_al[b][c]);
6287 fprintf (fp,"%c",toupper(B->seq_al[b][c]) );
6289 fprintf (fp," %4d\n", order[b][1] );
6293 fprintf (fp,"\n\n");
6295 free_int (order, -1);
/*
 * output_constraints: write a constraint list for alignment A to fname,
 * in the form selected by `mode`:
 *   - no A->CL, or mode "pdb": build a fresh list from the alignment
 *     (aln2constraint_list) and save it as a "lib", then free it;
 *   - mode starting with "extended_pair": the text after the 14th character
 *     of mode is split on '_' into two names, and the extended pair list
 *     for those two names is saved;
 *   - mode "extended_lib" / "extended_cosmetic": save the extended list,
 *     passing mode+9 as the sub-mode;
 *   - remaining visible branch: save A->CL itself as a "lib".
 * A list that is not A->CL is freed at the end. How CL is obtained in the
 * two "extended" branches is not visible in this excerpt.
 * NOTE(review): strtok is used on the duplicated mode string; strtok keeps
 * hidden static state and is not reentrant.
 */
6299 void output_constraints ( char *fname, char *mode,Alignment *A)
6302 Constraint_list *CL;
6306 if ( !A->CL || strm ( mode, "pdb"))
6313 CL=declare_constraint_list ( A->S, NULL, NULL, 0, NULL, NULL);
6314 CL=aln2constraint_list (A,CL, mode);
6315 fp=save_constraint_list ( CL, 0, CL->ne,fname, NULL, "lib",A->S);
6317 free_constraint_list (CL);
6320 else if ( strncmp ( mode, "extended_pair", 13)==0)
/* mode+14 skips "extended_pair" and one separator character. */
6322 buf=duplicate_string (mode+14);
6324 name_list=vcalloc(2, sizeof(char*));
6325 name_list[0]=strtok (buf,"_");
6326 name_list[1]=strtok (NULL,"_");
6331 fp=save_sub_list_header (vfopen(fname, "w"),2, name_list,CL);
6332 fp=save_extended_constraint_list_pair (CL, "pair",name_list[0],name_list[1],fp);
6333 fp=save_list_footer (fp, CL);
6336 else if ( strm2 (mode, "extended_lib","extended_cosmetic"))
/* mode+9 drops the "extended_" prefix. */
6339 fp=save_extended_constraint_list ( CL,mode+9, vfopen(fname, "w"));
6343 CL=(Constraint_list *)A->CL;
6344 fp=save_constraint_list ( CL, 0, CL->ne,fname, NULL, "lib",A->S);
6348 if ( (Constraint_list *)A->CL !=CL)free_constraint_list (CL);
/*
 * output_model_aln: write the state traceback stored with alignment A to
 * fname. A warning is printed to stderr when A->Dp_result is NULL.
 * Output: one "#STATE <letter>: <comment>" line per model state that has a
 * non-empty comment, then a FASTA-like record (">fname") whose sequence is
 * the traceback with each state index encoded as the letter 'a'+state,
 * wrapped via output_string_wrap (width 50).
 * How M and R are derived from A is not visible in this excerpt.
 */
6353 void output_model_aln (char *fname, Alignment*A )
6361 if ( A->Dp_result==NULL)
6363 fprintf ( stderr, "\nWARNING Could Not Output Model %s [%s]", fname, PROGRAM);
6368 fp=vfopen ( fname, "w");
6369 for (a=0; a<M->nstate; a++)
6371 if (M->model_comments[a][0])fprintf ( fp, "#STATE %c: %s\n", 'a'+a, M->model_comments[a]);
6373 string=vcalloc ( R->len+1, sizeof (char));
6374 for (a=0; a<R->len; a++)string[a]=R->traceback[a]+'a';
6375 fprintf ( fp, ">%s\n",fname);
6376 fp=output_string_wrap ( 50,string, fp);
6378 fprintf ( fp, "\n");
/*
 * output_fasta_sub_aln: write a subset of alignment A to fname as FASTA
 * (">name comment" then the aligned sequence, gaps kept).
 * A temporary name from vtmpnam(NULL) is used when fname is NULL.
 * The index s is presumably ls[a] for a in [0, ns); that assignment and the
 * return value are not visible in this excerpt.
 */
6383 char * output_fasta_sub_aln (char *fname, Alignment*A, int ns, int *ls )
6387 if (fname==NULL)fname=vtmpnam (NULL);
6388 fp=vfopen (fname, "w");
6389 for (a=0; a<ns; a++)
6392 fprintf (fp, ">%s %s\n%s\n", A->name[s],A->seq_comment[s],A->seq_al[s]);
/*
 * output_fasta_sub_aln2: same output as output_fasta_sub_aln, but for two
 * groups of sequences: group g contributes ns[g] entries.
 * A temporary name from vtmpnam(NULL) is used when fname is NULL.
 * The index s is presumably ls[g][a]; that assignment and the return value
 * are not visible in this excerpt.
 */
6397 char * output_fasta_sub_aln2 (char *fname, Alignment*A, int *ns, int **ls )
6401 if (fname==NULL)fname=vtmpnam (NULL);
6402 fp=vfopen (fname, "w");
6403 for ( g=0; g<2; g++)
6404 for (a=0; a<ns[g]; a++)
6407 fprintf (fp, ">%s %s\n%s\n", A->name[s],A->seq_comment[s],A->seq_al[s]);
/*
 * output_suchard_aln: back-translate A with back_translate_dna_aln, blank
 * out selected columns with '-' (the selection test is not visible in this
 * excerpt), remove columns via ungap_aln_n(A, 1) and write one line of
 * digits per sequence to out_file.
 * Visible nucleotide codes: g->2, c->3, t->4, u->5; the codes for other
 * characters are not visible in this excerpt.
 * NOTE: the function ends with myexit(EXIT_SUCCESS).
 */
6413 int output_suchard_aln (char *out_file, Alignment *A)
6418 A=back_translate_dna_aln (A);
6420 for ( c=0,a=0; a<A->len_aln; a++, c++)
6423 for (b=0; b<A->nseq; b++)
6427 A->seq_al[b][a]='-';
6431 A=ungap_aln_n (A, 1);
6432 fp=vfopen (out_file, "w");
6433 for ( a=0; a< A->nseq; a++)
6435 for (b=0; b< A->len_aln; b++)
6437 c=tolower(A->seq_al[a][b]);
6439 else if ( c=='g')d=2;
6440 else if ( c=='c')d=3;
6441 else if ( c=='t')d=4;
6442 else if ( c=='u')d=5;
6445 fprintf ( fp, "%d", d);
6447 fprintf ( fp, "\n");
6450 myexit (EXIT_SUCCESS);
/*
 * output_fasta_aln: write alignment A to fname as aligned FASTA (gaps are
 * kept; no ungap call is visible).
 * Each record is ">name", plus " comment" when the comment is neither empty
 * nor blank, followed by the aligned sequence wrapped at `line` columns.
 * line comes from get_msa_line_length with A->len_aln+1 as the alignment
 * length argument.
 */
6453 void output_fasta_aln (char *fname, Alignment*A )
6459 line=get_msa_line_length (line, A->len_aln+1);
6460 fp=vfopen ( fname, "w");
6462 for ( a=0; a< A->nseq; a++)
6464 fprintf ( fp, ">%s", A->name[a]);
6466 if ( A->seq_comment[a][0] && !isblanc (A->seq_comment[a]))fprintf ( fp, " %s", A->seq_comment[a]);
6467 fprintf ( fp, "\n");
6468 fp=output_string_wrap ( line,A->seq_al[a] , fp);
6469 fprintf ( fp, "\n");
/*
 * output_pir_aln: write alignment A to fname in PIR format.
 * Each record is ">TYPE;name", the comment line, the sequence wrapped at
 * 50 columns and a closing "*" line. TYPE is "DL" for DNA/RNA and "P1" for
 * PROTEIN, as reported by get_string_type.
 * NOTE(review): no fallback is visible for other string types, so `type`
 * may keep its previous contents in that case -- confirm.
 */
6474 void output_pir_aln (char *fname, Alignment*A )
6484 fp=vfopen ( fname, "w");
6485 for ( a=0; a< A->nseq; a++)
6487 if ( strm ( get_string_type (A->seq_al[a]),"DNA") || strm ( get_string_type (A->seq_al[a]),"RNA"))sprintf(type, "DL");
6488 else if ( strm ( get_string_type (A->seq_al[a]),"PROTEIN"))sprintf(type, "P1");
6489 fprintf ( fp, ">%s;%s\n%s\n",type, A->name[a], A->seq_comment[a]);
6490 fp=output_string_wrap ( 50,A->seq_al[a] , fp);
6491 fprintf ( fp, "\n*\n");
/*
 * set_landscape_msa: set the landscape_msa setting that
 * get_msa_line_length consults, and return its new value.
 * len==0 stores -1, which get_msa_line_length treats as "use the whole
 * alignment length". The branch for non-zero len is not visible in this
 * excerpt; presumably it stores len -- TODO confirm.
 */
6498 int set_landscape_msa (int len)
6500 if ( len==0)landscape_msa=-1;
6505 return landscape_msa;
6507 int get_msa_line_length (int line, int aln_len)
6509 if (landscape_msa==-1) return aln_len;
6510 else if ( landscape_msa)return landscape_msa;
6511 else if (line) return line;
6514 return (getenv ("ALN_LINE_LENGTH"))?atoi(getenv("ALN_LINE_LENGTH")):ALN_LINE_LENGTH;
/*
 * output_msf_aln: write alignment B to fname in MSF ("PileUp") format.
 * For every sequence a GCG checksum (SeqGCGCheckSum) is computed over a
 * copy in which gaps are '.' and residues are upper-cased; the sum of
 * these, modulo 10000, is the global check value.
 * The header lists "Name/Len/Check/Weight" per sequence, where the weight
 * is (B->S)->W->SEQ_W[i] when weights exist and 1.00 otherwise. The body is
 * printed in interleaved blocks of `line` columns, with extra handling
 * every `block` columns (the value of block and what is printed there are
 * not visible in this excerpt).
 * Side effect: residues of B->seq_al are upper-cased in place.
 * NOTE(review): the initialisation of grand_checksum is not visible in this
 * excerpt -- confirm it is zeroed before the accumulation loop.
 */
6518 void output_msf_aln (char *fname,Alignment *B)
6524 long grand_checksum;
6533 line=get_msa_line_length (line, B->len_aln+1);
6536 for ( max_len=0,a=0; a< B->nseq; a++)max_len= MAX(strlen ( B->name[a]),max_len);
6541 fp=vfopen (fname, "w");
/* seq holds exactly len_aln characters; no terminator slot is reserved. */
6543 seq =vcalloc(B->len_aln, sizeof(char));
6544 all_checks =vcalloc(B->nseq, sizeof(int));
6545 for ( i=0; i< B->nseq; i++)
6547 for ( j=0; j<B->len_aln; j++)
6549 if ( is_gap(B->seq_al[i][j]))seq[j]='.';
6550 else seq[j]=B->seq_al[i][j]=toupper(B->seq_al[i][j]);
6553 all_checks[i] = SeqGCGCheckSum(seq, (int)B->len_aln);
6556 for(i=0; i<B->nseq; i++) grand_checksum += all_checks[i];
6557 grand_checksum = grand_checksum % 10000;
6558 fprintf(fp,"PileUp\n\n");
6560 fprintf(fp,"\n\n MSF:%5d Type: ",B->len_aln);
6561 if(strm ( (B->S)->type, "DNA") || strm ( (B->S)->type, "RNA"))
6565 fprintf(fp," Check:%6ld .. \n\n", (long)grand_checksum);
6566 for (i=0; i< B->nseq; i++)
6568 fprintf ( fp, " Name: %s oo Len:%5d Check:%6ld Weight: %.3f\n", B->name[i], B->len_aln,(long)all_checks[i],(B->S)->W?((B->S)->W)->SEQ_W[i]:1.00);
6570 fprintf(fp,"\n//\n\n");
6572 for (a=0; a<B->len_aln; a+=line)
6574 fprintf ( fp,"\n\n");
6575 for (b=0; b<B->nseq; b++)
6577 fprintf (fp,"%-*s ",max_len,B->name[b]);
6578 for (c_block=0,c=a;c<a+line && c<B->len_aln;c++)
6580 if ( c_block==block)
6586 aa=(is_gap(B->seq_al[b][c]))?'.': toupper(B->seq_al[b][c]);
6587 fprintf (fp,"%c",aa );
6589 if ( c_block==block)
/*
 * SeqGCGCheckSum: GCG-style checksum of the first len characters of seq.
 *
 * Character i (0-based) contributes ((i % 57) + 1) * toupper(character);
 * the result is the sum of all contributions modulo 10000, so it is always
 * in [0, 9999]. Case is ignored.
 *
 * Returns 0 when seq is NULL or len <= 0.
 *
 * Changes from the previous version:
 *   - each character is converted to unsigned char before toupper();
 *     passing a negative plain char to <ctype.h> functions is undefined;
 *   - the running sum is reduced modulo 10000 at every step, which gives the
 *     same result but cannot overflow, whatever the input length;
 *   - a NULL seq is tolerated.
 */
int SeqGCGCheckSum(char *seq, int len)
{
  int i;
  long check=0;

  if (seq==NULL) return 0;

  for (i=0; i<len; i++)
    {
      long weight=(long)(i % 57)+1;
      long symbol=(long)toupper ((unsigned char)seq[i]);

      check=(check + weight*symbol) % 10000;
    }

  return (int)check;
}
/*
 * old_output_msf_aln: earlier MSF writer. Emits a fixed header
 * ("Check: 5083"), a "Name:" line per sequence, "//", and the alignment in
 * interleaved blocks with '-' shown as '.' and residues upper-cased.
 * put_seq[b] is set to 1 when the processed copy of sequence b is
 * non-empty; the processing applied to buf and the use of put_seq in the
 * output loops are not visible in this excerpt.
 * NOTE(review): the seq_max_len loop compares against max_len instead of
 * seq_max_len, so it ends up as max(length of the last sequence, max_len)
 * rather than the longest sequence. buf is sized from it (with sizeof(int)
 * per element, which over-allocates and may mask the problem).
 * NOTE(review): the put_seq loop starts at b=1; entry 0 stays at the zero
 * value from vcalloc unless set in lines not shown here.
 */
6618 void old_output_msf_aln (char *fname,Alignment *B)
6621 static int *put_seq;
6629 line=get_msa_line_length (line, B->len_aln+1);
6632 for ( max_len=0,a=0; a< B->nseq; a++)max_len= MAX(strlen ( B->name[a]),max_len);
6633 for ( seq_max_len=0,a=0; a< B->nseq; a++)seq_max_len= MAX(strlen ( B->seq_al[a]),max_len);
6636 buf=vcalloc(seq_max_len+1, sizeof (int));
6639 put_seq= vcalloc ( B->nseq, sizeof (int));
6643 for ( b=1; b< B->nseq; b++)
6645 sprintf ( buf, "%s", B->seq_al[b]);
6647 put_seq[b]=( strlen (buf)>0)?1:0;
6650 fp=vfopen ( fname, "w");
6651 fprintf ( fp, "MSF: %d Type P Check: 5083 ..\n", B->len_aln);
6652 for ( a=0; a< B->nseq; a++)
6655 fprintf ( fp,"Name: %s\n",B->name[a]);
6657 fprintf ( fp, "//\n");
6658 for (a=0; a<B->len_aln; a+=line)
6659 {for (b=0; b<B->nseq; b++)
6663 fprintf (fp,"%-*s ",max_len,B->name[b]);
6664 for (c=a;c<a+line && c<B->len_aln;c++)
6669 aa=(B->seq_al[b][c]=='-')?'.': toupper(B->seq_al[b][c]);
6670 fprintf (fp,"%c",aa );
6677 fprintf ( fp,"\n\n");
/*
 * output_saga_aln: write alignment B to `name` in "SAGA FORMAT": a header
 * line with the file name, nseq and len, then interleaved blocks of `line`
 * columns with names left-padded to the longest name. Characters are
 * written exactly as stored in B->seq_al.
 */
6684 void output_saga_aln ( char *name, Alignment *B)
6694 line=get_msa_line_length (line, B->len_aln+1);
6698 for ( max_len=0,a=0; a< B->nseq; a++)max_len= (strlen ( B->name[a])>max_len)?(strlen ( B->name[a])):max_len;
6703 fp= vfopen ( name, "w");
6705 fprintf (fp, "\nSAGA FORMAT\nalignement %s nseq=%d len=%d\n", name, B->nseq, B->len_aln);
6707 fprintf (fp, "\n\n");
6708 for (a=0; a<B->len_aln; a+=line)
6709 {for (b=0; b<B->nseq; b++)
6710 {fprintf (fp,"%-*s ",max_len,B->name[b]);
6711 for (c=a;c<a+line && c<B->len_aln;c++)
6713 fprintf (fp,"%c",(B->seq_al[b][c]) );
6719 fprintf (fp,"\n\n");
/*
 * output_compact_aln: like output_saga_aln (same "SAGA FORMAT" header), but
 * for each block the number of non-gap characters of a row is counted in
 * do_print first. Presumably rows that are all gaps in a block are skipped;
 * the test on do_print is not visible in this excerpt.
 * Gap characters other than '-' are normalised to '-' on output.
 */
6722 void output_compact_aln ( char *name, Alignment *B)
6732 line=get_msa_line_length (line, B->len_aln+1);
6735 for ( max_len=0,a=0; a< B->nseq; a++)max_len= (strlen ( B->name[a])>max_len)?(strlen ( B->name[a])):max_len;
6740 fp= vfopen ( name, "w");
6742 fprintf (fp, "\nSAGA FORMAT\nalignement %s nseq=%d len=%d", name, B->nseq, B->len_aln);
6743 fprintf (fp, "\n\n");
6744 for (a=0; a<B->len_aln; a+=line)
6745 {for (b=0; b<B->nseq; b++)
6748 for ( do_print=0, c=a;c<a+line && c<B->len_aln;c++)
6749 do_print+=1-is_gap(B->seq_al[b][c]);
6752 fprintf (fp,"%-*s ",max_len,B->name[b]);
6756 for (c=a;c<a+line && c<B->len_aln;c++)
6758 if ( is_gap(B->seq_al[b][c])&& B->seq_al[b][c]!='-' )fprintf (fp,"%c", '-');
6759 else fprintf (fp,"%c",(B->seq_al[b][c]) );
6766 fprintf (fp,"\n\n");
6770 void output_clustal_aln ( char *name, Alignment *B)
6772 return output_generic_clustal_aln (name, B, "tc_clustal");
6774 void output_strict_clustal_aln ( char *name, Alignment *B)
6776 return output_generic_clustal_aln (name, B, "strict_clustal");
/*
 * output_generic_clustal_aln: write alignment B to `name` in CLUSTAL
 * layout.
 * mode "strict_clustal" prints the "CLUSTAL W (1.83)" header; the other
 * visible header is the T-Coffee one with program, version, CPU time,
 * score, Nseq and Len. A "Ties" line derived from B->ibit is printed under
 * a condition that is not visible in this excerpt.
 * Line length: when the environment variable SEP_4_TCOFFEE is set, the
 * first block is cut just after the first 'o'/'O' found in sequence 0;
 * in the other visible branch `line` is advanced to B->len_aln. When line
 * equals B->len_aln, get_msa_line_length decides instead.
 * Rows run to b==B->nseq inclusive; the last row is the per-column
 * character from analyse_aln_column. The loop printing "name -" lines is
 * guarded by a condition that is not visible in this excerpt.
 * NOTE(review): nibit is computed but no use of it is visible here.
 */
6779 void output_generic_clustal_aln ( char *name, Alignment *B, char *mode)
6787 if ( getenv ("SEP_4_TCOFFEE"))
6789 while ( line<B->len_aln && B->seq_al[0][line]!='o' && B->seq_al[0][line]!='O')line++;
6790 if ( B->seq_al[0][line]=='O' || B->seq_al[0][line]=='o')line++;
6794 while ( line<B->len_aln)line++;
6797 if ( line==B->len_aln)line=get_msa_line_length (0, B->len_aln+1);
/* One extra counter slot for the consensus row. */
6799 n_residues=vcalloc ( B->nseq+1, sizeof (int));
6800 for ( a=0; a< B->nseq; a++)
6801 {if ( strlen (B->name[a])>max_len)
6802 max_len= strlen ( (B->name[a]));
6803 n_residues[a]=B->order[a][1];
6805 max_len=MAX(max_len+2, 16);
6808 fp= vfopen ( name, "w");
6810 if ( strm (mode, "strict_clustal"))
6811 fprintf ( fp, "CLUSTAL W (1.83) multiple sequence alignment");
6814 fprintf (fp, "CLUSTAL FORMAT for %s %s [%s] [MODE: %s ], CPU=%.2f sec, SCORE=%d, Nseq=%d, Len=%d ", PROGRAM, VERSION,URL, retrieve_mode (),(float)(B->cpu+get_time())/1000, B->score_aln, B->nseq, B->len_aln);
/* ibit is log2 of B->ibit. */
6817 float ibit=(float)log ((double)B->ibit)/log ((double)2);
6818 float nibit=(float)log(ibit/(B->len_aln*B->nseq));
6819 fprintf ( fp, "\nTies: %.1f bits (%d alternative)\n",ibit, B->ibit-1);
6823 fprintf (fp, "\n\n");
6828 for (b=0; b<=B->nseq; b++)
6829 fprintf (fp,"%-*s -\n",max_len, B->name[b]);
6834 for (a=0; a<B->len_aln; a+=line)
6835 {for (b=0; b<=B->nseq; b++)
6839 fprintf (fp,"%-*s",max_len, B->name[b]);
6840 for (c=a;c<a+line && c<B->len_aln;c++)
6842 if ( is_gap(B->seq_al[b][c]))fprintf (fp,"%c", '-');
6846 fprintf (fp, "%c", GET_CASE(B->residue_case, B->seq_al[b][c]));
6851 if (B->output_res_num)fprintf (fp, " %d", n_residues[b]);
6854 else if ( b==B->nseq)
6856 fprintf (fp,"%-*s",max_len," ");
6857 for (c=a;c<a+line && c<B->len_aln;c++)
6859 fprintf ( fp, "%c", analyse_aln_column (B, c));
6867 fprintf (fp,"\n\n");
/*
 * output_generic_interleaved_aln: print alignment B to the already-open
 * stream fp in interleaved blocks of `line` columns, writing `gap` for
 * every gap position and GET_CASE(B->residue_case, residue) otherwise.
 * Names are padded to max(longest name + 2, 16). When B->output_res_num is
 * set, the running residue count is appended to each row.
 * The loop printing "name -" lines is guarded by a condition that is not
 * visible in this excerpt; the use of `mode` and the return value are not
 * visible either.
 */
6871 FILE * output_generic_interleaved_aln (FILE *fp, Alignment *B, int line, char gap, char *mode)
6878 n_residues=vcalloc ( B->nseq+1, sizeof (int));
6879 for ( a=0; a< B->nseq; a++)
6880 {if ( strlen (B->name[a])>max_len)
6881 max_len= strlen ( (B->name[a]));
6882 n_residues[a]=B->order[a][1];
6884 max_len=MAX(max_len+2, 16);
6891 for (b=0; b<=B->nseq; b++)
6892 fprintf (fp,"%-*s -\n",max_len, B->name[b]);
6897 for (a=0; a<B->len_aln; a+=line)
6898 {for (b=0; b<=B->nseq; b++)
6902 fprintf (fp,"%-*s",max_len, B->name[b]);
6903 for (c=a;c<a+line && c<B->len_aln;c++)
6905 if ( is_gap(B->seq_al[b][c]))fprintf (fp,"%c", gap);
6909 fprintf (fp, "%c", GET_CASE(B->residue_case, B->seq_al[b][c]));
6914 if (B->output_res_num)fprintf (fp, " %d", n_residues[b]);
/*
 * output_phylip_aln: write alignment B to `name` in interleaved PHYLIP
 * layout: a "<nseq> <len>" header, then blocks of `line` columns.
 * The sequence name (truncated/padded to 10 characters) is printed only
 * while print_name[b] is 0; later blocks print 10 spaces instead (where
 * print_name[b] is updated is not visible in this excerpt).
 * Gap characters other than '-' are normalised to '-'. The counter d is
 * advanced per column; its use is not visible in this excerpt.
 */
6924 void output_phylip_aln ( char *name, Alignment *B)
6931 line=get_msa_line_length(0, 0);
6933 print_name=vcalloc ( B->nseq, sizeof (int));
6934 fp= vfopen ( name, "w");
6936 fprintf (fp, "%3d %d\n", B->nseq, B->len_aln);
6937 for (a=0; a<B->len_aln; a+=line)
6938 {for (b=0; b<B->nseq; b++)
6939 {if ( print_name[b]==0)
6942 fprintf (fp,"%-10.10s ",B->name[b]);
6947 fprintf (fp, "%10.10s ", " ");
6951 for (d=0,c=a;c<a+line && c<B->len_aln;c++, d++)
6958 if ( is_gap(B->seq_al[b][c])&& B->seq_al[b][c]!='-' )fprintf (fp,"%c", '-');
6959 else fprintf (fp,"%c",(B->seq_al[b][c]) );
6965 fprintf (fp,"\n\n");
/*
 * output_rnalign: write two files derived from out_file:
 *   "<out_file>.mss"     - an "ST" record holding the structure string of
 *                          STRUC->seq[0] threaded onto the gapped first
 *                          sequence of A, followed by one "AS name" record
 *                          for every sequence except the last;
 *   "<out_file>.one_rna" - the last sequence of A in FASTA form.
 * The character stored in buf at gap positions is not visible in this
 * excerpt.
 * NOTE(review): `strlen ( A->seq_al[0]+1)` measures the string starting at
 * its second character, so buf is allocated with len-1 bytes while len
 * characters are written; `strlen (A->seq_al[0])+1` looks intended.
 * NOTE(review): bank_file (100 bytes) and pep_file are filled with
 * unbounded sprintf from out_file.
 */
6969 void output_rnalign (char *out_file, Alignment *A, Sequence *STRUC)
6973 char bank_file[100];
6977 sprintf ( bank_file, "%s.mss", out_file);
6978 sprintf ( pep_file, "%s.one_rna", out_file);
6981 buf=vcalloc ( strlen ( A->seq_al[0]+1), sizeof (char));
6983 for ( b=0,a=0; a< strlen(A->seq_al[0]); a++)
6985 if ( is_gap(A->seq_al[0][a]))
/* Non-gap positions consume the next structure character. */
6988 buf[a]=STRUC->seq[0][b++];
6992 fp=vfopen ( bank_file, "w");
6994 fprintf ( fp, "ST\n");
6995 fp=output_string_wrap ( 50, buf, fp);
6996 fprintf ( fp, "\n\n");
6998 for ( a=0; a<A->nseq-1; a++)
7000 fprintf ( fp, "AS %s\n ", A->name[a]);
7001 fp=output_string_wrap ( 50, A->seq_al[a], fp);
7002 fprintf ( fp, "\n\n");
7005 fp=vfopen ( pep_file, "w");
7006 fprintf ( fp, ">%s\n", A->name[A->nseq-1]);
7007 fp=output_string_wrap ( 50, A->seq_al[A->nseq-1], fp);
7008 fprintf ( fp, "\n");
/*
 * output_lib: for every pair (a, b) of sequences in A, copy the two aligned
 * sequences and their names into a scratch alignment B and write it with
 * output_clustal_aln to "<prefix>_<name a>_<name b>.lib".
 * Setup of B beyond declare_Alignment(NULL) (e.g. nseq, buffer sizes) is
 * not visible in this excerpt.
 * NOTE(review): fname is filled with unbounded sprintf.
 */
7012 void output_lib (char *pw_lib_saga_aln_name, Alignment *A )
7015 char fname[VERY_LONG_STRING];
7018 B=declare_Alignment (NULL);
7022 for ( a=0; a< A->nseq-1; a++)
7024 for ( b=a+1; b<A->nseq; b++)
7026 sprintf ( B->seq_al[0], "%s", A->seq_al[a]);
7027 sprintf ( B->name[0], "%s", A->name[a]);
7028 sprintf(B->name[1], "%s", A->name[b]);
7029 sprintf ( B->seq_al[1], "%s",A->seq_al[b]);
7031 sprintf ( fname, "%s_%s_%s.lib",pw_lib_saga_aln_name, A->name[a], A->name[b]);
7033 B->len_aln=strlen ( B->seq_al[0]);
7035 output_clustal_aln (fname,B);
/*
 * output_pw_lib_saga_aln: same pairwise export as output_lib, but the files
 * are named "<prefix>_<name a>_<name b>.pw_lib_saga_aln".
 * Setup of B beyond declare_Alignment(NULL) is not visible in this excerpt.
 * NOTE(review): fname is filled with unbounded sprintf.
 */
7039 void output_pw_lib_saga_aln (char *pw_lib_saga_aln_name, Alignment *A )
7042 char fname[VERY_LONG_STRING];
7045 B=declare_Alignment (NULL);
7049 for ( a=0; a< A->nseq-1; a++)
7051 for ( b=a+1; b<A->nseq; b++)
7053 sprintf ( B->seq_al[0], "%s", A->seq_al[a]);
7054 sprintf ( B->name[0], "%s", A->name[a]);
7055 sprintf(B->name[1], "%s", A->name[b]);
7056 sprintf ( B->seq_al[1], "%s",A->seq_al[b]);
7058 sprintf ( fname, "%s_%s_%s.pw_lib_saga_aln",pw_lib_saga_aln_name, A->name[a], A->name[b]);
7060 B->len_aln=strlen ( B->seq_al[0]);
7062 output_clustal_aln (fname,B);
/*
 * output_lalign_header: create `name` (mode "w") and write the Lalign
 * banner: program version/date/URL and, for the two sequences referenced by
 * A->order[0][0] and A->order[1][0], their source file, name and length as
 * stored in A->S.
 */
7066 void output_lalign_header( char *name, Alignment *A)
7070 fp=vfopen ( name, "w");
7071 fprintf ( fp, " Lalign mode: best local alignments between two sequences\n");
7072 fprintf ( fp, " %s(%s) [%s]\n\n", VERSION, DATE, URL);
7073 fprintf ( fp, " Comparison of:\n(A) %s\t%s\t-%d aa\n", (A->S)->file[A->order[0][0]],(A->S)->name[A->order[0][0]], (A->S)->len[A->order[0][0]]);
7074 fprintf ( fp, "(B) %s\t%s\t-%d aa\n", (A->S)->file[A->order[1][0]],(A->S)->name[A->order[1][0]], (A->S)->len[A->order[1][0]]);
/*
 * output_stockholm_aln: write alignment A to `file` in Stockholm 1.0
 * layout: the "# STOCKHOLM 1.0" header, the alignment through
 * output_generic_interleaved_aln (50 columns per line, '.' as gap
 * character) and a closing "//".
 * Side effect: every STOCKHOLM_CHAR in A->seq_al is replaced by '.' first.
 * No use of ST is visible in this excerpt.
 */
7080 void output_stockholm_aln (char *file, Alignment *A, Alignment *ST)
7085 for (a=0; a<A->nseq; a++)
7086 for (b=0; b<A->len_aln; b++)
7087 if (A->seq_al[a][b]==STOCKHOLM_CHAR)A->seq_al[a][b]='.';
7089 fp=vfopen (file, "w");
7090 fprintf ( fp, "# STOCKHOLM 1.0\n\n");
7091 output_generic_interleaved_aln (fp,A, 50, '.', NULL);
7092 fprintf ( fp, "//\n");
/*
 * output_glalign: write alignment B to `name` in the "GLALIGN_01" format:
 * explanatory comment lines, the list of genomes (sequence names), then one
 * line per alignment column giving, for each sequence, either "----" (gap)
 * or the residue counter nr[b][1], followed by "[ reliability ]".
 * Reliability is S->seq_al[S->nseq][a] when S is given, with
 * NO_COLOR_RESIDUE mapped to -1, and -1 when S is NULL.
 * Returns immediately when B is NULL. nr starts as a copy of B->order; where
 * it is incremented is not visible in this excerpt.
 * NOTE(review): the "Alignment ... Score" line reads S->score_aln without
 * the NULL check that is applied to S further down.
 */
7096 void output_glalign ( char *name, Alignment *B, Alignment *S)
7103 if ( B==NULL){return;}
7105 fp=vfopen (name, "w");
7106 fprintf (fp, "Format: GLALIGN_01 [Generated with %s ]\n", PROGRAM);
7107 fprintf (fp, "#Each Line corresponds to a column\n");
7108 fprintf (fp, "#First column coresponds to first genome\n");
7109 fprintf (fp, "#Last Column gives the column reliability on a 0-9 scale\n");
7110 fprintf (fp, "#[-1] Indicates that the reliability was not evaluated\n");
7112 fprintf (fp, "Genome List\n");
7113 for ( a=0; a< B->nseq; a++)
7114 fprintf (fp, "\tGenome %s\n", B->name[a]);
7115 fprintf (fp, "Alignment List\n");
7118 fprintf (fp, "Alignment %d Len %d Score %d\n", ++naln, B->len_aln, S->score_aln);
7119 nr=duplicate_int (B->order, -1, -1);
7120 for ( a=0; a< B->len_aln; a++)
7122 fprintf ( fp, "\t");
7123 for ( b=0; b< B->nseq; b++)
7125 g=is_gap (B->seq_al[b][a]);
7128 if (g)fprintf (fp, "---- ");
7129 else fprintf ( fp, "%4d ",nr[b][1]);
7131 s=((S)?S->seq_al[S->nseq][a]:-1);
7132 if (s==NO_COLOR_RESIDUE)s=-1;
7133 fprintf ( fp,"[ %d ]",s);
7134 fprintf ( fp, "\n");
/*
 * input_conc_aln: read a file made of several alignment records separated
 * by TC_REC_SEPARATOR.
 * The whole file is loaded as a string; literal '@' characters are
 * protected, the record separator is rewritten to '@', and strtok then
 * splits the records (the first token is discarded). Each record is
 * written to a temporary file and parsed with main_read_aln. When IN is
 * supplied, the parsed alignment is copied into it.
 * How successive alignments are chained (F, A, B) and what is returned are
 * not visible in this excerpt.
 * NOTE(review): '@' is protected as "!Protected!" but restored from
 * "!protected!" (different case); unless substitute() ignores case, the
 * original '@' characters are never restored.
 */
7143 Alignment *input_conc_aln ( char *name, Alignment *IN)
7146 char *string, *p, *file;
7147 Alignment *F=NULL,*A=NULL, *B=NULL;
7149 file=vtmpnam (NULL);
7151 string=file2string(name);
7152 string=substitute ( string, "@", "!Protected!");
7153 string=substitute ( string, TC_REC_SEPARATOR, "@");
7154 strtok (string,"@");
7157 while ( (p=strtok (NULL,"@"))!=NULL)
7160 buf=vcalloc ( strlen (p)+1, sizeof (char));
7161 sprintf (buf,"%s", p);
7162 buf=substitute (buf,"!protected!", "@");
7164 fp=vfopen (file, "w");
7165 fprintf ( fp, "%s",buf);
7171 B=main_read_aln (file,NULL);
7175 if (IN){copy_aln (B, IN);F=A=IN;}
/*
 * output_conc_aln: write alignment B to `name` in the concatenated format
 * read by input_conc_aln: a "# CONC_MSF_FORMAT_01" line, then a
 * TC_REC_SEPARATOR line followed by ">name" / aligned-sequence pairs.
 * Any loop over chained alignments is not visible in this excerpt.
 */
7190 void output_conc_aln ( char *name, Alignment *B)
7195 fp=vfopen (name, "w");
7196 fprintf (fp, "# CONC_MSF_FORMAT_01\n");
7199 fprintf (fp, "%s\n", TC_REC_SEPARATOR);
7200 for ( a=0; a< B->nseq; a++)
7202 fprintf ( fp, ">%s\n%s\n", B->name[a], B->seq_al[a]);
/*
 * output_lalign: write one local alignment B to `name` in Lalign layout.
 * The banner (output_lalign_header) is written when the static flag
 * output_header is 0; calling with B==NULL resets the flag, so the next
 * real call starts a new file. The alignment itself is appended by
 * output_lalign_aln. Where the flag is set is not visible in this excerpt.
 */
7210 void output_lalign ( char *name, Alignment *B)
7212 static int output_header;
7215 if ( B==NULL){output_header=0;return;}
7216 else if ( output_header==0)
7218 output_lalign_header(name, B);
7223 output_lalign_aln ( name, B);
/*
 * output_lalign_aln: append (mode "a") the pairwise alignment B to `name`.
 * First an identity summary: over columns where neither of the first two
 * sequences has a gap, matches are counted in id; the line printed is
 * "<id>% identity in <tot> aa overlap; score: <score>" (the update of tot
 * and the conversion of id to a percentage are not visible in this
 * excerpt).
 * Then, per block of `line` columns, five rows (b=0..4): residue-number
 * rulers, the two sequences (b==1 and b==3) and a match line built from
 * analyse_aln_column with the symbols shifted ('*'->':', ':'->'.',
 * '.'->' '). The mapping from b to the sequence index s is not visible in
 * this excerpt. Each alignment ends with a "----------" separator.
 */
7227 void output_lalign_aln ( char *name, Alignment *B)
7229 int a, b, c,d=0, s=0;
7242 n_residues=vcalloc ( B->nseq+1, sizeof (int));
7243 for ( a=0; a< B->nseq; a++)
7244 {if ( strlen (B->name[a])>max_len)
7245 max_len= strlen ( (B->name[a]));
7246 n_residues[a]=B->order[a][1];
7248 max_len=MAX(max_len+2, 16);
7253 fp= vfopen ( name, "a");
7255 for (a=0; a< B->len_aln; a++)
7257 if ( !is_gap(B->seq_al[0][a]) && !is_gap(B->seq_al[1][a]))
7260 id+=(B->seq_al[0][a]==B->seq_al[1][a]);
7265 fprintf (fp, " %.1f%% identity in %d aa overlap; score: %d\n\n", id,(int)tot, B->score_aln);
7268 for (a=0; a<B->len_aln; a+=line)
7269 {for (b=0; b<5; b++)
7275 fprintf (fp,"%-*s",max_len," ");
7276 for (d=0,c=a;c<a+line && c<B->len_aln;c++)
7278 res=!is_gap ( B->seq_al[s][c]);
/* Print a 4-wide residue number at every 10th residue if it still fits. */
7280 if ( (n_residues[s]%10)==0 && res && (c-a+4)<line){fprintf (fp, "%-4d", n_residues[s]);d=-3;}
7283 if ( d==0)fprintf (fp, " ");
7289 else if (b==1 || b==3)
7293 fprintf (fp,"%-*s",max_len, B->name[s]);
7294 for (c=a;c<a+line && c<B->len_aln;c++)
7296 if ( is_gap(B->seq_al[s][c]))fprintf (fp,"%c", '-');
7299 fprintf (fp, "%c", GET_CASE(B->residue_case, B->seq_al[s][c]));
7306 fprintf (fp,"%-*s",max_len," ");
7307 for (c=a;c<a+line && c<B->len_aln;c++)
7309 col=analyse_aln_column (B, c);
7310 if ( col=='*')col=':';
7311 else if ( col==':')col='.';
7312 else if ( col=='.')col=' ';
7313 fprintf ( fp, "%c", col);
7321 fprintf (fp,"\n\n----------\n\n");
7327 /****************************************************************************************************/
7328 /*************************************UTIL *********************************************************/
7329 /**************************************************************************************************/
7332 /****************************************************************************************************/
7333 /*************************** *************************************/
7334 /*************************** PROCESSING *************************************/
7335 /*************************** *************************************/
7336 /*******************************************************************************************/
7340 /***************************************************************************************** */
/*
 * thread_aa_seq_on_dna_seq: allocate a buffer three times the length l of
 * the input (plus terminator) and fill it in steps of three characters per
 * input character. The computation of l, the loop body and the return
 * value are not visible in this excerpt.
 */
7342 char *thread_aa_seq_on_dna_seq( char *s)
7349 array=vcalloc ( l*3 +1, sizeof (char));
7350 for ( b=0, c=0; b< l; b++, c+=3)
/*
 * thread_dnaseq_on_prot_aln: build an alignment B three times as long as
 * the protein alignment A.
 * For each sequence of A, S is searched for a sequence with the same name.
 * In the first visible fill loop every column of A becomes three
 * characters: "---" for a gap, otherwise the next three characters of the
 * matching sequence in S. In the second visible fill loop the original
 * character is kept and followed by "--". The condition that selects
 * between the two loops is not visible in this excerpt.
 * B->len_aln is finally taken from the length of B->seq_al[0]. How B is
 * first created and what is returned are not visible in this excerpt.
 */
7360 Alignment *thread_dnaseq_on_prot_aln (Sequence *S, Alignment *A)
7363 int a, b, c, n, la, ls, ln, m;
7366 B=realloc_aln2 ( B, B->nseq, B->len_aln*3 +1);
7368 for ( n=0,a=0; a< A->nseq; a++)
7370 for ( m=0,b=0; b< S->nseq; b++)
7372 if (strm (A->name[a], S->name[b]) )
7377 B->seq_al[a][0]='\0';
7378 for (la=0, ls=0, ln=0; la< A->len_aln; la++)
7380 for (c=0; c< 3; c++)
7381 B->seq_al[a][ls++]=(is_gap(A->seq_al[a][la]))?'-':S->seq[b][ln++];
7383 B->seq_al[a][ls]='\0';
7388 for (la=0, ls=0, ln=0; la< A->len_aln; la++)
7391 B->seq_al[a][ls++]=A->seq_al[a][la];
7392 B->seq_al[a][ls++]='-';
7393 B->seq_al[a][ls++]='-';
7398 B->len_aln=strlen ( B->seq_al[0]);
/*
 * thread_seq_struc2aln: overwrite the residues of alignment A with the
 * characters of the same-named entries of ST, keeping gaps where they are.
 * For every name match, each non-gap position of A->seq_al[a] is replaced
 * by the next character of ST->seq[b].
 * In addition, when ST contains an entry named "Cons" whose length equals
 * A->len_aln, it is copied into slot A->nseq of A under the name "Cons".
 */
7401 void thread_seq_struc2aln ( Alignment *A, Sequence *ST)
7406 for ( a=0; a< A->nseq; a++)
7407 for ( b=0; b< ST->nseq; b++)
7409 if ( strcmp ( A->name[a], ST->name[b])==0)
7412 len=strlen(A->seq_al[a]);
7413 for ( c=0, d=0; c<len; c++)
7415 if ( !is_gap(A->seq_al[a][c]))A->seq_al[a][c]=ST->seq[b][d++];
7419 cons=name_is_in_list ("Cons", ST->name, ST->nseq, 100);
7420 if ( cons!=-1 && A->len_aln==strlen ( ST->seq[cons]))
7422 sprintf (A->name[A->nseq], "Cons");
7423 sprintf (A->seq_al[A->nseq],"%s", ST->seq[cons]);
/*
 * cache_id: recode alignment A in place.
 * Pass 1 (per column): in a column with no gap at all, every residue
 * becomes 'h'; otherwise every non-gap residue becomes 'x'.
 * Pass 2 (per sequence, interior positions only): a position is set to 'h'
 * when both neighbours are 'h' or 'b', otherwise to 'b'; the test that
 * selects which positions are rewritten is not visible in this excerpt.
 * Finally every 'b' at an interior position is turned into 'x'.
 */
7427 void cache_id ( Alignment *A)
7432 for ( a=0; a< A->len_aln; a++)
7434 for ( b=0, n=0; b< A->nseq; b++)if ( !is_gap(A->seq_al[b][a]))n++;
7435 for ( b=0; b< A->nseq; b++)
7436 if ( !is_gap(A->seq_al[b][a]) && n==A->nseq)A->seq_al[b][a]='h';
7437 else if( !is_gap(A->seq_al[b][a]))A->seq_al[b][a]='x';
7439 for ( a=0; a< A->nseq; a++)
7441 for ( b=1; b< A->len_aln-1; b++)
7443 r1=A->seq_al[a][b-1];
7445 r3=A->seq_al[a][b+1];
7448 if ( (r1=='h' || r1=='b') && (r3=='h' || r3=='b'))A->seq_al[a][b]='h';
7449 else A->seq_al[a][b]='b';
7452 for ( b=1; b< A->len_aln-1; b++)if ( A->seq_al[a][b]=='b')A->seq_al[a][b]='x';
7458 /*******************************************************************************************/
7461 /* PROCESSING OF EST */
7463 /***************************************************************************************** */
/*
 * process_est_sequence: orient and cluster the sequences of S.
 * For every pair (a, b) the best match score is computed twice with
 * get_best_match: once against S->seq[b] (sens) and once against its
 * inverted form from invert_seq (a_sens); best keeps the larger of the two.
 * SHC then searches for an orientation per sequence (1 or -1), and
 * search_for_cluster fills cluster_list with a cluster index per sequence
 * using `best` and the threshold T (its value is not visible in this
 * excerpt). The clusters are reported on stderr.
 * Side effects: every sequence whose solution is -1 has S->seq[a] replaced
 * by its inverted copy and its name prefixed with "i_". invert_seq also
 * lower-cases all input sequences in place.
 * The return value is not visible in this excerpt.
 * NOTE(review): the "i_" prefix is written back into S->name[a] with
 * sprintf, without a visible check on the size of the name buffer.
 */
7464 int process_est_sequence ( Sequence *S, int *cluster_list)
7466 char **inverted_seq;
7474 char buf [VERY_LONG_STRING];
7478 sens=declare_int ( S->nseq,S->nseq);
7479 a_sens=declare_int ( S->nseq,S->nseq);
7480 best=declare_int ( S->nseq,S->nseq);
7483 inverted_seq=vcalloc ( S->nseq, sizeof (char*));
7484 for ( a=0; a<S->nseq; a++)
7485 inverted_seq[a]=invert_seq ( S->seq[a]);
7487 for ( a=0; a< S->nseq-1; a++)
7490 for ( b=a+1; b<S->nseq; b++)
7493 V1=sens[a][b]=sens[b][a]=get_best_match ( S->seq[a], S->seq[b]);
7494 V2=a_sens[a][b]=a_sens[b][a]=get_best_match ( S->seq[a],inverted_seq[b]);
7495 best[a][b]=best[b][a]=(V1>V2)?V1:V2;
7498 solution=SHC ( S->nseq, a_sens, sens);
/* -1 marks "not yet assigned to a cluster". */
7501 for ( a=0; a<S->nseq; a++)cluster_list[a]=-1;
7502 for ( a=0; a<S->nseq; a++)
7504 n=search_for_cluster (a, n_clusters, cluster_list, T, S->nseq, best);
7505 if ( n>0)n_clusters++;
7507 fprintf ( stderr, "\nTHERE %s %d Independant Cluster(s) in your sequences",(n_clusters>1)?"are":"is",(n_clusters));
7508 for (a=0; a<n_clusters; a++)
7510 fprintf (stderr, "\n");
7511 for ( b=0; b<S->nseq; b++)
7513 if ( cluster_list[b]==a)fprintf ( stderr, "%s ", S->name[b]);
7517 for ( a=0; a<S->nseq; a++)
7519 if ( solution[a]==-1)
7521 S->seq[a]=inverted_seq[a];
7522 sprintf ( buf, "i_%s", S->name[a]);
7523 sprintf ( S->name[a], "%s", buf);
/*
 * search_for_cluster: recursive cluster assignment starting from `seq`.
 * An unassigned seq (cluster_list[seq]==-1) is given cluster_number; every
 * still-unassigned sequence a that passes a test (not visible in this
 * excerpt; presumably a comparison of S[seq][a] with T) is given the same
 * number and explored recursively. n accumulates the recursive results;
 * its initial value and the return statement are not visible in this
 * excerpt.
 */
7529 int search_for_cluster ( int seq, int cluster_number, int *cluster_list, int T, int nseq, int **S)
7533 if (cluster_list[seq]==-1)
7535 cluster_list[seq]=cluster_number;
7538 for ( a=0; a<nseq; a++)
7539 if ( cluster_list[a]==-1)
7545 cluster_list[a]=cluster_number;
7546 n+=search_for_cluster ( a, cluster_number, cluster_list, T, nseq, S);
/*
 * SHC: randomised search for an orientation vector sol[0..nseq-1] with
 * entries 1 or -1, scored by evaluate_sol(sol, nseq, ST, NST).
 * Starts from a random assignment (addrand(100) > 49 gives 1, else -1),
 * then performs up to N_IT (= VERY_LONG_STRING) iterations while the score
 * is below VERY_LONG_STRING: mutate_sol changes the solution and returns an
 * index mut, and the new score is compared with the current one. Two
 * branches are visible: new_score > score, and a random test of
 * addrand(VERY_LONG_STRING) against score; sol[mut] is flipped back in a
 * context that is not fully visible in this excerpt.
 * Progress is reported on stderr. The return statement is not visible in
 * this excerpt.
 */
7552 int * SHC ( int nseq, int **NST, int **ST)
7556 int score, new_score;
7557 int N_IT=VERY_LONG_STRING;
7561 sol=vcalloc ( nseq, sizeof (int));
7562 for ( a=0; a<nseq; a++)
7563 sol[a]=(addrand ((unsigned long)100)>49)?1:-1;
7565 score=evaluate_sol (sol, nseq, ST, NST);
7566 fprintf ( stderr, "\nI_Score=%d\n", score);
7569 for ( count=0,a=0; a< N_IT && score<VERY_LONG_STRING; a++, count++)
7571 mut=mutate_sol ( sol,nseq);
7572 new_score=evaluate_sol (sol, nseq, ST, NST);
7573 if ( new_score>score)
7577 else if ( (addrand ((unsigned long)VERY_LONG_STRING))>score)
7582 sol[mut]=sol[mut]*-1;
7583 if ( count==VERY_LONG_STRING)
7586 fprintf ( stderr, "\nScore=%d", score);
7589 fprintf ( stderr, "\nScore=%d\n", score);
// mutate_sol: draws a random index n with addrand(nseq).
// NOTE(review): the rest of the body is not visible here; SHC() uses the return
// value as the index of the modified position, so presumably sol[n] is flipped
// and n is returned - confirm.
7593 int mutate_sol (int *sol, int nseq)
7596 n=addrand ((unsigned long)nseq);
// evaluate_sol: scores a sign vector sol[].
// For every pair a<b the score adds NST[a][b] when sol[a] and sol[b] have
// opposite signs and ST[a][b] otherwise. The result is scaled to
// (score*VERY_LONG_STRING)/max_score, where max_score sums the larger of
// ST[a][b] and NST[a][b] over all pairs.
// NOTE(review): max_score is static and is accumulated with +=; the condition
// guarding that loop is not visible here. If max_score is 0 the final division
// is undefined, and score*VERY_LONG_STRING may overflow int - confirm ranges.
7600 int evaluate_sol ( int *sol, int nseq, int **ST, int **NST)
7602 static int max_score;
7607 for ( a=0; a<nseq-1; a++)
7608 for ( b=a+1; b<nseq; b++)
7610 max_score+=(ST[a][b]>NST[a][b])?ST[a][b]:NST[a][b];
7614 for ( a=0; a<nseq-1; a++)
7615 for (b=a+1; b<nseq; b++)
7616 if ( (sol[a]*sol[b])<0)score+=NST[a][b];
7617 else score+=ST[a][b];
7618 return (score*VERY_LONG_STRING)/max_score;
// invert_seq: builds the reverse complement of a nucleotide string.
// seq is first converted to lower case IN PLACE, then a new buffer of l+1 bytes
// is filled from the last character of seq to the first with n->n, g->c, c->g,
// a->t and t->a.
// NOTE(review): the handling of any other character and the return statement
// are not visible here; presumably the new buffer is returned - confirm.
7622 char * invert_seq ( char *seq)
7631 for ( a=0; a<l; a++)
7632 seq[a]=tolower ( seq[a]);
7633 nseq=vcalloc ( l+1, sizeof (char));
// a walks the output forwards while b walks the input backwards.
7635 for ( a=0, b=l-1; a<l; a++, b--)
7637 if (seq[b]=='n')nseq[a]='n';
7638 else if (seq[b]=='g')nseq[a]='c';
7639 else if (seq[b]=='c')nseq[a]='g';
7640 else if (seq[b]=='a')nseq[a]='t';
7641 else if (seq[b]=='t')nseq[a]='a';
// get_best_match: length of the longest diagonal stretch of matches between
// seq1 and seq2.
// A matrix m is filled with 1 where the two characters are equal or either one
// is 'n', and 0 otherwise; extract_m_diag_streches() lists the stretches and the
// largest first field (stretch length) is kept in best.
// NOTE(review): m appears to be a cached ml x ml matrix that is declared again
// when ml is smaller than l1 or l2; the declarations of m, ml, l1 and l2 and the
// return statement are not visible here.
7649 int get_best_match ( char *seq1, char *seq2)
7665 m=declare_int (ml, ml);
7667 else if ( (ml<l1) || (ml<l2))
7671 m=declare_int (ml, ml);
7674 for ( a=0; a<l1; a++)
7676 for ( b=0; b<l2; b++)
7677 m[a][b]=((seq1[a]==seq2[b])|| seq1[a]=='n' ||seq2[b]=='n')?1:0;
7679 mdiag= extract_m_diag_streches ( m, l1, l2,seq1, seq2, &n_mdiag);
7681 for ( best=0,a=0; a<n_mdiag; a++)
7682 best=(mdiag[a][0]>best)?mdiag[a][0]:best;
// extract_m_diag_streches: walks the diagonals of the match matrix m and records
// stretches in mdiag; the number of records is written to n_mdiag[0].
// Layout of one record, as written below:
//   [0] length of the stretch      [1] start x     [2] start y
//   [3] end x (x-1 when closed)    [4] end y (y-1 when closed)
// A record is kept (n_mdiag[0]++) only when is_strech("ta", ...) returns 0 for it.
// The table grows by VERY_LONG_STRING rows when n_mdiag[0] reaches max_diag-1.
// NOTE(review): max_diag is static while mdiag is obtained from declare_int on a
// visible line; whether that allocation runs on every call, and how s1/s2 are
// advanced, is decided on lines not visible here.
7687 int** extract_m_diag_streches ( int ** m, int l1, int l2,char *seq1, char *seq2, int *n_mdiag)
7690 int b, x, y, s1, s2;
7693 static int max_diag=VERY_LONG_STRING;
7704 mdiag=declare_int ( max_diag, 5);
// First diagonal starts at the last row of seq1 and the first column of seq2.
7706 for ( s1=l1-1, s2=0;s2<l2;)
7708 for ( in=0,x=s1, y=s2; x<l1 && y<l2; x++, y++)
7713 mdiag[n_mdiag[0]][0]++;
7716 mdiag[n_mdiag[0]][0]=1;
7717 mdiag[n_mdiag[0]][1]=x;
7718 mdiag[n_mdiag[0]][2]=y;
7726 mdiag[n_mdiag[0]][3]=x-1;
7727 mdiag[n_mdiag[0]][4]=y-1;
7728 if ( !is_strech ( "ta", seq1, seq2,mdiag[n_mdiag[0]][0], mdiag[n_mdiag[0]][1],mdiag[n_mdiag[0]][2]))n_mdiag[0]++;
7730 if (n_mdiag[0]==(max_diag-1))
7731 {mdiag=vrealloc (mdiag, (max_diag+VERY_LONG_STRING)*sizeof (int*));
7732 for ( b=max_diag; b<max_diag+VERY_LONG_STRING; b++)mdiag[b]=vcalloc ( 5, sizeof (int));
7733 max_diag+=VERY_LONG_STRING;
// Same closing sequence as above, repeated further down the function.
7741 mdiag[n_mdiag[0]][3]=x-1;
7742 mdiag[n_mdiag[0]][4]=y-1;
7743 if ( !is_strech ( "ta", seq1, seq2,mdiag[n_mdiag[0]][0], mdiag[n_mdiag[0]][1],mdiag[n_mdiag[0]][2]))n_mdiag[0]++;
// is_strech: tests whether a diagonal stretch is dominated by one character.
// For each of the first n characters of AA it counts the positions, along len
// steps starting at seq1[x] / seq2[y], where BOTH sequences carry that
// character; it returns 1 as soon as that count exceeds T percent of len.
// NOTE(review): the assignments of n and T and the fall-through return are not
// visible here; len==0 would divide by zero - confirm callers never pass it.
7749 int is_strech ( char *AA, char *seq1, char *seq2, int len, int x, int y)
7751 int n, i, j, c,a,nr;
7755 for ( a=0; a<n; a++)
7757 for (nr=0, i=x, j=y, c=0; c<len; c++, i++, j++)
7758 if ((seq1[i]==AA[a]) && (seq2[j]==AA[a]))nr++;
7759 if ( ((nr*100)/len)>T)return 1;
7765 /************************************************************************************/
7770 /************************************************************************************/
// Forward declarations of the two residue helpers used by output_seq2struc().
7772 char * oneletaa2threeletaa(char aa);
7773 float aa2property (char aa, char *mode);
// output_seq2struc: writes a CA-only pseudo structure for the first sequence of A.
// The sequence is ungapped in place, one ATOM record is written per residue to
// a temporary file, that file is passed through the external command
// extract_from_pdb, and the result is copied character by character to outfile.
// For each residue h is the "doolittle" property and v the "volume" property;
// dx is sqrt(14.398907/((h*h)/(v*v)+1)).
// NOTE(review): the computation of x, y and z, the closing of the files and the
// return value are not visible here. The shell command is assembled with
// sprintf from the temporary file names.
7775 int output_seq2struc(char *outfile, Alignment *A)
7779 float v, h, x, y, z, dx, dy, dz;
7781 char *tmpfile1, *tmpfile2;
7784 tmpfile1=vtmpnam(NULL);
7785 tmpfile2=vtmpnam(NULL);
7787 ungap (A->seq_al[0]);
7788 s=A->seq_al[0];l=strlen (s);
7789 fp1=vfopen (tmpfile1, "w");
7792 for ( a=0; a< l; a++)
7794 h=aa2property ( s[a], "doolittle" );
7795 v=aa2property (s[a], "volume");
7796 /*14.398907: peptide bond length*/
7797 dx=(float)sqrt ((double)(14.398907/(((h*h)/(v*v))+1)));
7805 fprintf (fp1, "ATOM%7d CA %s A%4d%12.3f%8.3f%8.3f 1.00 5.30\n",a+1, oneletaa2threeletaa(s[a]),a+1, x, y, z);
7808 sprintf ( command, "extract_from_pdb -infile %s -force > %s", tmpfile1, tmpfile2);
7809 my_system (command);
7810 fp1=vfopen (tmpfile2, "r");
7811 fp2=vfopen (outfile, "w");
7813 while ( (c=fgetc(fp1))!=EOF)fprintf (fp2, "%c", c);
// oneletaa2threeletaa: maps a one-letter amino-acid code to its three-letter
// PDB-style name (string literal, not to be freed or modified).
// Only the twenty lower-case letters listed below are compared; when none
// matches, the error branch prints a message to stderr and calls myexit().
// NOTE(review): the error text names aa2hydropathy rather than this function;
// whether the input is lower-cased first is not visible here.
7820 char * oneletaa2threeletaa(char aa)
7823 if ( aa=='a')return "ALA";
7824 else if ( aa=='r') return "ARG";
7825 else if ( aa=='n') return "ASN";
7826 else if ( aa=='d') return "ASP";
7827 else if ( aa=='c') return "CYS";
7828 else if ( aa=='q') return "GLN";
7829 else if ( aa=='e') return "GLU";
7830 else if ( aa=='g') return "GLY";
7831 else if ( aa=='h') return "HIS";
7832 else if ( aa=='i') return "ILE";
7833 else if ( aa=='l') return "LEU";
7834 else if ( aa=='k') return "LYS";
7835 else if ( aa=='m') return "MET";
7836 else if ( aa=='f') return "PHE";
7837 else if ( aa=='p') return "PRO";
7838 else if ( aa=='s') return "SER";
7839 else if ( aa=='t') return "THR";
7840 else if ( aa=='w') return "TRP";
7841 else if ( aa=='y') return "TYR";
7842 else if ( aa=='v') return "VAL";
7845 fprintf ( stderr, "\nERROR: %c is not an amino acid [FATAL::aa2hydropathy::%s]", aa, PROGRAM);
7846 myexit (EXIT_FAILURE);
// aa2property: returns a per-residue numeric property selected by mode.
//   mode NULL or "doolittle" : first table below (presumably the Kyte-Doolittle
//                              hydropathy scale - confirm);
//   mode "volume"            : second table below (units not stated here).
// Only lower-case one-letter codes are compared. An unlisted residue or an
// unknown mode prints an error to stderr and calls myexit(EXIT_FAILURE).
7852 float aa2property (char aa, char *mode)
7854 if ( mode==NULL || strm (mode, "doolittle"))
7857 if ( aa=='i')return 4.5;
7858 else if ( aa=='v') return 4.2;
7859 else if ( aa=='l') return 3.8;
7860 else if ( aa=='f') return 2.8;
7861 else if ( aa=='c') return 2.5;
7862 else if ( aa=='m') return 1.9;
7863 else if ( aa=='a') return 1.8;
7864 else if ( aa=='g') return -0.4;
7865 else if ( aa=='t') return -0.7;
7866 else if ( aa=='w') return -0.9;
7867 else if ( aa=='s') return -0.8;
7868 else if ( aa=='y') return -1.3;
7869 else if ( aa=='p') return -1.6;
7870 else if ( aa=='h') return -3.2;
7871 else if ( aa=='e') return -3.5;
7872 else if ( aa=='q') return -3.5;
7873 else if ( aa=='d') return -3.5;
7874 else if ( aa=='n') return -3.5;
7875 else if ( aa=='k') return -3.9;
7876 else if ( aa=='r') return -4.5;
7879 fprintf ( stderr, "\nERROR: %c is not an amino acid [FATAL::aa2hydropathy::%s]", aa, PROGRAM);
7880 myexit (EXIT_FAILURE);
7883 else if (strm (mode, "volume"))
7886 if ( aa=='a')return 0.915;
7887 else if ( aa=='r') return 2.02;
7888 else if ( aa=='n') return 1.35;
7889 else if ( aa=='d') return 1.24;
7890 else if ( aa=='c') return 1.18;
7891 else if ( aa=='q') return 1.61;
7892 else if ( aa=='e') return 1.55;
7893 else if ( aa=='g') return 0.66;
7894 else if ( aa=='h') return 1.67;
7895 else if ( aa=='i') return 1.69;
7896 else if ( aa=='l') return 1.68;
7897 else if ( aa=='k') return 1.71;
7898 else if ( aa=='m') return 1.70;
7899 else if ( aa=='f') return 2.03;
7900 else if ( aa=='p') return 1.29;
7901 else if ( aa=='s') return 0.99;
7902 else if ( aa=='t') return 1.22;
7903 else if ( aa=='w') return 2.37;
7904 else if ( aa=='y') return 2.03;
7905 else if ( aa=='v') return 1.41;
7908 fprintf ( stderr, "\nERROR: %c is not an amino acid [FATAL::aa2hydropathy::%s]", aa, PROGRAM);
7909 myexit (EXIT_FAILURE);
7915 fprintf ( stderr, "\nERROR: %s is an unknown mode [FATAL::aa2hydropathy::%s]", mode , PROGRAM);
7916 myexit (EXIT_FAILURE);
7925 /************************************************************************************/
7930 /************************************************************************************/
// code_dna_aln: rewrites the characters of A->seq_al in place with a digit code.
// Visible rules, per sequence, with a running counter l reset at column 0:
//   '~'            : column skipped;
//   '.'            : l is incremented;
//   not lower case : the character becomes '4';
//   remaining case : the character becomes '0'+((l+3)%3).
// NOTE(review): the first branch of the chain, the place where l advances for
// ordinary residues and the return value are not visible here.
7932 Alignment *code_dna_aln (Alignment *A)
7936 for ( a=0; a< A->nseq; a++)
7938 for (l=0, b=0; b< A->len_aln; b++)
7942 else if ( r=='~')continue;
7943 else if ( r=='.')l++;
7944 else if ( !islower(r))A->seq_al[a][b]='4';
7947 A->seq_al[a][b]=(l+3)%3+'0';
// back_translate_dna_aln: replaces every row of A by the output of
// back_translate_dna_seq(row, seq, RANDOM) and multiplies A->len_aln by 3.
// The alignment is first resized with realloc_aln(A, 10000) and the scratch
// buffer seq holds 10000 bytes.
// NOTE(review): the 10000-byte sizes are hard-coded; rows whose back-translation
// is longer would overflow - confirm the intended limit. The original comment
// below mentions an amino-acid result although the visible code triples the
// length; it may have been copied from translate_dna_aln.
7956 Alignment *back_translate_dna_aln (Alignment *A)
7958 /*Given a set of aligned sequences
7959 starts from left to right
7961 ambiguities are randomly resolved.
7962 returns the corresponding amino acid alignment
7968 A=realloc_aln (A, 10000);
7969 seq=vcalloc ( 10000, sizeof (char));
7972 for ( a=0; a< A->nseq; a++)
7974 seq=back_translate_dna_seq (A->seq_al[a], seq, RANDOM);
7975 sprintf ( A->seq_al[a], "%s", seq);
7977 A->len_aln=A->len_aln*3;
// back_translate_dna_seq: appends back_translate_dna_codon(residue, mode) to
// out_seq for each of the len characters of in_seq.
// out_seq : destination; when NULL a buffer of len*3+1 bytes is allocated.
// NOTE(review): whether a caller-supplied out_seq is emptied before the strcat
// loop, and what is returned, is decided on lines not visible here. Repeated
// strcat rescans the destination on every iteration.
7982 char * back_translate_dna_seq ( char *in_seq,char *out_seq, int mode)
7988 if (out_seq==NULL)out_seq=vcalloc ( len*3+1, sizeof (char));
7991 for (a=0; a<len; a++)
7993 strcat (out_seq, back_translate_dna_codon (in_seq[a],mode));
// Forward declarations of the two converters dispatched by transform_sequence().
7999 static Sequence *rna_seq2dna_seq (Sequence *S);
8000 static Sequence *dna_seq2rna_seq (Sequence *S);
// transform_sequence: dispatches on mode.
//   "rna2dna" : returns rna_seq2dna_seq(S);
//   "dna2rna" : returns dna_seq2rna_seq(S);
//   other     : printf_exit() with an "Unknown -transform mode" message.
8001 Sequence * transform_sequence ( Sequence *S, char *mode)
8003 if ( strm (mode, "rna2dna"))
8004 return rna_seq2dna_seq (S);
8005 else if ( strm (mode, "dna2rna"))
8006 return dna_seq2rna_seq (S);
8008 printf_exit (EXIT_FAILURE, stderr, "Unknown -transform mode: %s [FATAL:%s]\n", mode,PROGRAM);
// rna_seq2dna_seq: replaces 'u' by 't' and 'U' by 'T' in every sequence of S,
// in place. Exits through printf_exit() unless S->type is "DNA" or "RNA".
// NOTE(review): strlen() is evaluated in the loop condition on every iteration,
// and the HERE() call prints each converted sequence - it looks like a leftover
// debug trace; confirm before relying on clean output. The return statement is
// not visible here.
8011 Sequence *rna_seq2dna_seq (Sequence *S)
8015 if ( !strm(S->type, "DNA") && !strm (S->type, "RNA")) printf_exit (EXIT_FAILURE, stderr, "Sequences should be *RNA* type [FATAL:%s]\n", PROGRAM);
8016 for ( a=0; a<S->nseq; a++)
8018 for (b=0; b<strlen (S->seq[a]); b++)
8020 if ( S->seq[a][b]=='u') S->seq[a][b]='t';
8021 if ( S->seq[a][b]=='U') S->seq[a][b]='T';
8023 HERE ("%s", S->seq[a]);
// dna_seq2rna_seq: replaces 't' by 'u' and 'T' by 'U' in every sequence of S,
// in place, over the first S->len[a] characters of each sequence.
// Exits through printf_exit() unless S->type is "DNA" or "RNA".
// NOTE(review): in the error call the format expects the type first
// ("type=%s") and the program name second ("FATAL:%s"), but the arguments are
// passed as PROGRAM, S->type - the two values appear swapped in the message.
// The return statement is not visible here.
8027 Sequence *dna_seq2rna_seq (Sequence *S)
8031 if ( !strm(S->type, "DNA") && !strm (S->type, "RNA")) printf_exit (EXIT_FAILURE, stderr, "Sequences should be *DNA* type (type=%s) [FATAL:%s]\n", PROGRAM, S->type);
8032 for ( a=0; a<S->nseq; a++)
8033 for (b=0; b<S->len[a]; b++)
8035 if ( S->seq[a][b]=='t') S->seq[a][b]='u';
8036 if ( S->seq[a][b]=='T') S->seq[a][b]='U';
// Forward declaration; the definition appears further down in this file.
8043 int get_longest_frame (char *seq, int mode);
// translate_dna_seqS: translates every sequence of S in place, codon by codon.
// Starting at offset frame-1 and stepping by 3, position b receives
// translate_dna_codon(s+b, stop) and the following positions of the codon
// (up to two, bounded by l) are set to '-'.
// NOTE(review): the assignments of s and l and the return are not visible here.
8044 Sequence *translate_dna_seqS (Sequence *S, int frame, int stop)
8050 for (a=0; a<S->nseq; a++)
8055 for (b=(frame-1); b<l; b+=3)
8057 s[b]=translate_dna_codon (s+b,stop);
8058 for (c=b+1; c<b+3 && c<l; c++)s[c]='-';
// translate_dna_aln: translates the rows of A in place, three columns at a time.
// frame 3 or 4 : each row is first shifted to the frame returned by
//                get_longest_frame(); one branch copies d+f over d, the other
//                applies complement_string() first and records
//                "Reverse Complement" in A->seq_comment[a].
// visible loop over b<frame : the first frame columns of every row become '-'.
// Translation: translate_dna_codon(row+a, 'z') yields r and three characters
// are written per codon: "---", "o--", "x--" or r followed by "--".
// NOTE(review): the conditions selecting each of those four outputs, the branch
// structure around the frame handling and the return are not visible here.
8063 Alignment *translate_dna_aln (Alignment *A, int frame)
8065 /*Given a set of aligned sequences
8066 starts from left to right
8068 2nuc+1gap, 1nuc+2gap->3 gaps
8070 returns the corresponding amino acid alignment
8077 if (frame==3 || frame ==4)
8080 for (a=0; a< A->nseq; a++)
8084 f=get_longest_frame (d,frame);
8085 buf=vcalloc ( strlen (d)+1, sizeof (char));
8088 sprintf (buf, "%s", d+f);
8089 sprintf (d, "%s", buf);
8090 sprintf (A->seq_comment[a], " frame: %d", f);
8095 sprintf ( buf, "%s", d);
8096 buf=complement_string (buf);
8097 sprintf (d, "%s",buf+f);
8098 sprintf (A->seq_comment[a], " frame: %d Reverse Complement", f);
8106 for ( a=0; a< A->nseq; a++)
8107 for (b=0; b< frame; b++)
8108 A->seq_al[a][b]='-';
8112 for ( b=0; b< A->nseq; b++)
8113 for ( a=0; a< A->len_aln;)
8116 r=translate_dna_codon (A->seq_al[b]+a, 'z');
8119 A->seq_al[b][a++]='-';
8120 A->seq_al[b][a++]='-';
8121 A->seq_al[b][a++]='-';
8125 A->seq_al[b][a++]='o';
8126 A->seq_al[b][a++]='-';
8127 A->seq_al[b][a++]='-';
8131 A->seq_al[b][a++]='x';
8132 A->seq_al[b][a++]='-';
8133 A->seq_al[b][a++]='-';
8137 A->seq_al[b][a++]=r;
8138 A->seq_al[b][a++]='-';
8139 A->seq_al[b][a++]='-';
// seq2tblastx_db: writes translated fragments of every sequence of S to file
// out, in FASTA form.
// The inner loop runs b from -3 to 3; strand==1 skips b<0 and strand==2 skips
// b>0. The sequence is passed through complement_string() before translation
// and again afterwards when b<0. Each fragment longer than min_exon_len is
// written with the header  name__d|r__start__end__dl  ('d' when b>0, 'r'
// otherwise; dl is the nucleotide length).
// Returns EXIT_SUCCESS.
// NOTE(review): the treatment of b==0, the derivation of f, l, end and
// min_exon_len, and how fragments are delimited are not visible here.
8148 int seq2tblastx_db (char *out,Sequence *S, int strand)
8150 //strand : same values as in ncbi blastall
8158 fp=vfopen (out, "w");
8159 for (a=0; a<S->nseq; a++)
8161 for (b=-3; b<=3; b++)
8166 dl=strlen (S->seq[a]);
8168 else if ( strand==1 && b<0)continue;//only direct strand
8169 else if ( strand==2 && b>0)continue;//only reverse strand
8172 S->seq[a]=complement_string (S->seq[a]);
8177 prot=translate_dna_seq (S->seq[a], f, 'X', NULL);
8178 upper_string (prot);
8182 for (pprot=prot,d=0; d<=l; d++)
8187 if ( strlen (pprot)>min_exon_len)
// start is derived from end and the fragment length in codons (3 nt each).
8191 start=(end-(strlen (pprot))*3)+1;
8192 fprintf (fp, ">%s__%c__%d__%d__%d\n%s\n", S->name[a],(b>0)?'d':'r',start,end,dl, pprot);
8198 if (b<0) S->seq[a]=complement_string (S->seq[a]);
8202 return EXIT_SUCCESS;
// get_longest_frame: tries nf reading frames of in_seq and keeps the index of
// the one with the largest l.
// mode 3 : nf=3;  mode 4 : nf=6, and the working copy is passed through
// complement_string() when the fourth frame (a==3) is reached.
// Ties go to the later frame because the comparison is l>=max_l.
// in_seq is not modified; the function works on a private copy.
// NOTE(review): the computation of f and l (presumably the translated length up
// to the first stop, since the stop character is '\0'), the release of seq and
// prot, and the return are not visible here.
8205 int get_longest_frame (char *in_seq, int mode)
8213 seq=vcalloc (strlen (in_seq)+1, sizeof (char));
8214 prot=vcalloc (strlen (in_seq)+1, sizeof (char));
8215 sprintf ( seq, "%s", in_seq);
8217 if ( mode == 3)nf=3;
8218 else if ( mode == 4) nf=6;
8220 for (a=0; a<nf; a++)
8223 if (a==3)seq=complement_string (seq);
8225 prot=translate_dna_seq ( seq,f,'\0', prot);
8227 if (l>=max_l){max_l=l;best_frame=a;}
// clean_gdna_aln: dynamic-programming (Viterbi-style) pass over an alignment
// with four states ORF1, ORF2, ORF3 and NC.
// Visible steps:
//   1. every character of A is lower-cased and 't' becomes 'u';
//   2. is_dna[] flags the rows whose type string is "DNA";
//   3. a per-column score is accumulated in score[A->nseq][col] from the
//      pam250mt matrix over pairs of column residues (DNA rows are translated
//      codon by codon first);
//   4. transitions ORF1->ORF2->ORF3->ORF1 cost AL, ORF3->NC and NC->ORF1 cost
//      AL-SPLICE_PENALTY, everything else is F (forbidden);
//   5. for each row the tables v_tab / v_tab_p are filled and traced back into
//      T->seq_al: state 0 writes the translated codon, states 1 and 2 write '-',
//      other positions keep the '.' they were initialised with.
// NOTE(review): the creation of T, the value of nstate, the initialisation of
// v_tab and the return are not visible here.
8234 Alignment *clean_gdna_aln (Alignment *A)
8236 int a, b, c, r1, r2,s, p, n, tn;
8244 /*Viterbi Parameters*/
8245 int AL=0; /*Allowed Transition*/
8246 int F=-1000000; /*Forbiden Transition*/
8247 int SPLICE_PENALTY=100;
8248 int ORF1=0, ORF2=1, ORF3=2, NC=3;
8250 int state, pstate, best_e, best_pstate_p,best_state_p, best_pstate_v, best_state_v, v;
8258 best_state_p=best_state_v=best_pstate_p=best_pstate_v=best_e=0;
8259 buffer=vcalloc ( 100000, sizeof (char));
8260 is_dna=vcalloc ( A->nseq, sizeof (int));
8261 score=declare_int ( A->nseq+1, A->len_aln);
8264 if ( !mat)mat=read_matrice("pam250mt");
8266 col=vcalloc ( A->nseq, sizeof (int));
// The bound is <=, so index len_aln of every row is rewritten as well.
8268 for (a=0; a<= A->len_aln; a++)
8269 for ( b=0; b< A->nseq; b++){A->seq_al[b][a]=tolower(A->seq_al[b][a]); A->seq_al[b][a]=(A->seq_al[b][a]=='t')?'u':A->seq_al[b][a];}
8271 for ( a=0; a< A->nseq; a++)
// NOTE(review): buffer holds 100000 bytes and is filled with an unbounded sprintf.
8273 sprintf ( buffer, "%s", A->seq_al[a]);
8275 is_dna[a]=strm ( get_string_type (buffer), "DNA");
8279 for (a=0; a< A->len_aln-2; a++)
8281 for (b=0; b< A->nseq; b++)
8283 if (is_dna[b])col[b]=translate_dna_codon (A->seq_al[b]+a, 'x');
8284 else col[b]=tolower ( A->seq_al[b][a]);
// NOTE(review): tn is reset to 0 here and no visible line increments it, yet the
// column score is divided by tn further down; confirm against the full source.
8287 for (n=0,tn=0,b=0; b< A->nseq; b++)
8288 for ( c=b; c< A->nseq; c++ )
8293 if (r1=='x' || r2=='x'){score[A->nseq][a]=F;break;}
8294 else if (r1=='-' && r2=='-');
8295 else if (r1=='-' || r2=='-');
// Pairs involving a non-DNA row are weighted by nseq*nseq.
8299 if ( is_dna[b] && is_dna[c])score[A->nseq][a]+= mat[r1-'A'][r2-'A'];
8300 else score[A->nseq][a]+=mat[r1-'A'][r2-'A']* (A->nseq*A->nseq);
8302 n+=( !is_gap(r1) && !is_gap(r2));
8303 score[A->nseq][a]=(((tn!=0)?score[A->nseq][a]/tn:0));
8310 transitions=declare_int ( nstate, nstate);
8311 v_tab=declare_int ( A->len_aln+2, nstate );
8312 v_tab_p=declare_int ( A->len_aln+2, nstate );
8314 for (a=0; a<nstate;a++)
8315 for (b=0; b<nstate;b++)
8316 {transitions[a][b]=F;}
8318 transitions[ORF1][ORF2]=AL;
8319 transitions[ORF2][ORF3]=AL;
8320 transitions[ORF3][ORF1]=AL;
8322 transitions[ORF3][NC] =AL-SPLICE_PENALTY;
8323 transitions[NC][ORF1] =AL-SPLICE_PENALTY;
8326 for ( s=0; s<A->nseq; s++)
8328 for ( p=0; p<=A->len_aln; p++){for (state=0; state< nstate; state++)v_tab_p[p][state]=-1; }
8329 for (p=1+2; p<= A->len_aln; p++)
8332 for (state=0; state< nstate; state++)
// NC emits the negated best coding emission seen for this position.
8335 if ( state==NC){e=-best_e;}
8338 e=score[A->nseq][(p-1)-state];
8339 if ( state==0)best_e=e;
8340 else best_e=MAX(e, best_e);
8343 for ( pstate=0; pstate<nstate; pstate++)
8345 v=e+transitions[pstate][state]+v_tab[p-1][pstate];
8346 if (pstate==0 ||(v>best_pstate_v) )
8349 best_pstate_p=pstate;
8353 v_tab[p][state]=best_pstate_v;
8354 v_tab_p[p][state]=best_pstate_p;
8355 if (state==0 ||best_pstate_v>best_state_v )
8358 best_state_v=best_pstate_v;
// Traceback from the last column to the first.
8366 for (p=0; p< A->len_aln; p++)T->seq_al[s][p]='.';
8367 for (p=A->len_aln; p>0; p--)
8370 if ( best_state_p==0)T->seq_al[s][p-1]=translate_dna_codon (A->seq_al[s]+(p-1), 'x');
8371 else if ( best_state_p==1 || best_state_p==2)T->seq_al[s][p-1]='-';
8375 best_state_p=v_tab_p[p][best_state_p];
// clean_cdna_aln: labels each column of a nucleotide alignment as coding or
// non-coding with a dynamic-programming pass, then masks A in place.
// Visible steps:
//   1. emission[col] is set to 100 * em1, where em1 is the fraction of rows
//      whose codon at that column translates to a non-gap;
//   2. transitions: START->C1|NC, C1->C2->C3->C1, C3|NC->END cost AL;
//      C3->NC, NC->C1 and NC->NC cost AL-PENALTY; all others are F;
//   3. for each row, score_tab / state_tab are filled, the best state that can
//      reach END is selected, and the traceback writes the state digit into
//      B->seq_al;
//   4. A is rewritten codon by codon with '~' for gaps and '.' for masked
//      residues; an unexpected state prints a "PROBLEM" line to stderr;
//   5. the three tables are released with free_int().
// NOTE(review): em2 (column similarity) is computed, but the only visible
// assignment to emission[] uses em1 alone. The values of the state constants,
// PENALTY, nstate, the creation of B and the return are not visible here.
8386 Alignment *clean_cdna_aln (Alignment *A)
8388 /*Given an alignmnet of nucleotides
8389 Returns the same alignmnent whith non coding nucleotides replaced with dots
8391 at each position, the emission probability is the sum of pair of the substitution of amino-acids
8404 /*Viterbi Parameters*/
8405 int AL=0; /*Allowed Transition*/
8406 int F=-1000000; /*Forbiden Transition*/
8408 int NC, C1,C2, C3, START, END;
8410 int state=0,best_state=0, score=0, best_score=0;
8429 buffer=vcalloc ( 100000, sizeof (char));
8430 emission=vcalloc (A->len_aln, sizeof (int));
8434 mat=read_matrice("pam250mt");
8437 /*Computation of the emission proba for the coding state*/
8440 for (a=0; a< A->len_aln; a++)
8443 /*First component: % occupancy of the column*/
8445 for ( b=0; b< A->nseq; b++) em1+=!is_gap(translate_dna_codon (A->seq_al[b]+a, '-'));
8446 em1=em1/(float)A->nseq;
8448 /*Second Component: % similarity within column*/
8450 for (n=0,b=0; b< A->nseq-1; b++)
8452 r1=translate_dna_codon (A->seq_al[b]+a, '-');
8454 for (c=b+1; c<A->nseq; c++)
8456 r2=translate_dna_codon (A->seq_al[c]+a, '-');
8457 if (is_gap(r2) || is_gap(r1));
// A pair counts as similar when its matrix entry is greater than 1.
8461 em2+=((mat[r1-'A'][r2-'A'])>1)?1:0;
8465 em2=em2/(float)((n==0)?1:n);
8468 emission[a]=(em1*100);
8476 transitions=declare_int ( nstate, nstate);
8477 score_tab=declare_int ( A->len_aln+2, nstate );
8478 state_tab=declare_int ( A->len_aln+2, nstate );
8480 for (a=0; a<nstate;a++)
8481 for (b=0; b<nstate;b++)
8482 {transitions[a][b]=F;}
8485 transitions[START][C1]=AL;
8486 transitions[START][NC]=AL;
8487 transitions[C3][END]=AL;
8488 transitions[NC][END]=AL;
8489 transitions[C1 ][C2 ]=AL;
8490 transitions[C2 ][C3 ]=AL;
8491 transitions[C3 ][C1 ]=AL;
8492 transitions[C3 ][NC ]=AL-PENALTY;
8493 transitions[NC ][C1 ]=AL-PENALTY;
8494 transitions[NC][NC]=AL-PENALTY;
8498 for ( s=0; s< A->nseq; s++)
8500 for ( p=0; p<=A->len_aln; p++){for (state=0; state< nstate; state++){score_tab[p][state]=F;state_tab[p][state]=-1;} }
// Only START is reachable before the first column.
8501 score_tab[0][START]=0;
8503 for (p=1; p<= A->len_aln; p++)
8505 for (state=0; state< nstate; state++)
8507 if ( state==START || state==END)continue;
8508 else if ( state==NC) e=-10;
8509 else if ( state==C1)
8513 else if ( state ==C2)
// C2 and C3 reuse the emission of the column where their codon started.
8516 else e=emission[p-2];
8518 else if ( state==C3)
8521 else e=emission[p-3];
8524 for (p_state=0; p_state<nstate; p_state++)
// F is propagated unchanged so a forbidden path never becomes attractive.
8530 score=(score_tab[p-1][p_state]==F)?F:(e+transitions[p_state][state]+score_tab[p-1][p_state]);
8533 if(p_state==0 || score>best_score){ best_score=score;best_state=p_state;}
8537 score_tab[p][state]=best_score;
8538 state_tab[p][state]=best_state;
8543 best_score=best_state=UNDEFINED;
8544 for (state=0; state<nstate; state++)
8546 if (state==START || state==END)continue;
8547 e=transitions[state][END];
8548 if (e==F || score_tab[p-1][state]==F)continue;
8550 if (best_score==UNDEFINED || score_tab[p-1][state]>best_score)
8552 best_score=score_tab[p-1][state]+e;
8558 for (p=A->len_aln; p>0;)
8560 B->seq_al[s][p-1]=best_state+'0';
8561 best_state=state_tab[p][best_state];
8566 for ( a=0; a< A->nseq; a++)
8567 for ( b=0; b< A->len_aln;)
8573 r2=A->seq_al[a][b+1];
8574 r3=A->seq_al[a][b+2];
8577 if ( is_gap(r1) ||is_gap(r2) || is_gap(r3))
8579 A->seq_al[a][b]=(is_gap(r1))?'~':'.';
8580 A->seq_al[a][b+1]=(is_gap(r2))?'~':'.';
8581 A->seq_al[a][b+2]=(is_gap(r3))?'~':'.';
8585 else if ( s==NC+'0')
8587 A->seq_al[a][b]=(is_gap(A->seq_al[a][b]))?'~':'.';
8592 fprintf (stderr, "\nPROBLEM: [%d %d]->%d", a, b, s-'0');
8598 free_int (transitions, -1);
8599 free_int (score_tab, -1);
8600 free_int (state_tab, -1);
// translate_splice_dna_aln: dynamic-programming translation of an alignment
// with eight states: ORF1..ORF3 (coding frames), SPL1..SPL4 (splice-site
// positions emitting g, t, a, g respectively) and NC (non-coding).
// Visible steps:
//   1. every character of A is lower-cased and 't' becomes 'u';
//   2. score[A->nseq][col] accumulates pam250mt values over all pairs of
//      translated column residues and is divided by the pair count tn;
//   3. the transition table is filled (see notes below);
//   4. for each row the tables are filled and traced back into T->seq_al:
//      state 0 writes the upper-cased translated codon, SPL1..SPL4 write '-',
//      other positions keep '.'.
// NOTE(review): SPLICE_PENALTY is set to -1000 and then SUBTRACTED, so those
// transitions carry +1000 rather than a cost - confirm the intent. The pair
// ORF3->ORF1 / ORF3->SPL1 is assigned twice with identical values. Step 1
// turns 't' into 'u', yet SPL2 is emitted only when r=='t'. ST is not
// referenced on any visible line. nstate, T, the initialisation of v_tab and
// the return are not visible here.
8610 Alignment *translate_splice_dna_aln (Alignment *A, Alignment *ST)
8612 int a, b, c, r1, r2,s, p, n, tn;
8618 /*Viterbi Parameters*/
8619 int AL=0; /*Allowed Transition*/
8620 int F=-1000000; /*Forbiden Transition*/
8621 int ORF1=0, ORF2=1, ORF3=2,SPL1=3, SPL2=4, SPL3=5, SPL4=6, NC=7;
8623 int frame1, frame2, frame3, best_frame;
8629 int state=0, pstate=0, best_pstate_p=0,best_state_p=0, best_pstate_v=0, best_state_v=0, v=0;
8636 score=declare_int ( A->nseq+1, A->len_aln);
8639 if ( !mat)mat=read_matrice("pam250mt");
8641 col=vcalloc ( A->nseq, sizeof (int));
// The bound is <=, so index len_aln of every row is rewritten as well.
8643 for (a=0; a<= A->len_aln; a++)
8644 for ( b=0; b< A->nseq; b++){A->seq_al[b][a]=tolower(A->seq_al[b][a]); A->seq_al[b][a]=(A->seq_al[b][a]=='t')?'u':A->seq_al[b][a];}
8649 for (a=0; a< A->len_aln-2; a++)
8651 for (b=0; b< A->nseq; b++)
8653 col[b]=translate_dna_codon (A->seq_al[b]+a, 'x');
8656 for (n=0,tn=0,b=0; b< A->nseq-1; b++)
8657 for ( c=b+1; c< A->nseq; c++, tn++ )
8662 if (r1=='x' || r2=='x')score[A->nseq][a]=F;
8663 else if (r1=='-' && r2=='-');
8664 else if (r1=='-' || r2=='-');
8667 score[A->nseq][a]+= mat[r1-'A'][r2-'A'];
8670 n+=( !is_gap(r1) && !is_gap(r2));
8672 score[A->nseq][a]=(((tn!=0)?score[A->nseq][a]/tn:0));
8678 transitions=declare_int ( nstate, nstate);
8679 v_tab=declare_int ( A->len_aln+2, nstate*nstate);
8680 v_tab_p=declare_int ( A->len_aln+2, nstate*nstate);
8682 for (a=0; a<nstate;a++)
8683 for (b=0; b<nstate;b++)
8684 {transitions[a][b]=F;}
8686 SPLICE_PENALTY=-1000;
8688 transitions[ORF1][ORF2] =AL;
8689 transitions[ORF1][SPL1] =AL-SPLICE_PENALTY;
8691 transitions[ORF2][ORF3] =AL;
8692 transitions[ORF2][SPL1] =AL-SPLICE_PENALTY;
8694 transitions[ORF3][ORF1] =AL;
8695 transitions[ORF3][SPL1] =AL-SPLICE_PENALTY;
8697 transitions[ORF3][ORF1] =AL;
8698 transitions[ORF3][SPL1] =AL-SPLICE_PENALTY;
8700 transitions[ORF3][NC]=AL-100;
8701 transitions[NC][ORF1]=AL-100;
8704 transitions[SPL1][SPL2]=AL;
8705 transitions[SPL2][NC ]=AL-SPLICE_PENALTY;
8706 transitions[NC ][NC ]=AL;
8707 transitions[NC ][SPL3]=AL-SPLICE_PENALTY;
8708 transitions[SPL3][SPL4]=AL;
8709 transitions[SPL4][ORF1]=AL;
8710 transitions[SPL4][ORF2]=AL;
8711 transitions[SPL4][ORF3]=AL;
8714 for ( s=0; s<A->nseq; s++)
8716 for ( p=0; p<=A->len_aln; p++){for (state=0; state< nstate; state++)v_tab_p[p][state]=-1; }
8717 for (p=1+2; p<= A->len_aln; p++)
// Column scores of the three codons that can cover position p-1.
8719 frame1=score[A->nseq][(p-1)];
8720 frame2=score[A->nseq][(p-1)-1];
8721 frame3=score[A->nseq][(p-1)-2];
8722 best_frame=best_int (3, 1, &a, frame1, frame2, frame3);
8723 for (state=0; state< nstate; state++)
8725 r=tolower (A->seq_al[s][p-1]);
8728 if (state==ORF1)e=frame1;
8729 else if (state==ORF2)e=frame2;
8730 else if (state==ORF3)e=frame3;
8731 else if (state==SPL1)e=(r=='g')?best_frame:F;
8732 else if (state==SPL2)e=(r=='t')?best_frame:F;
8733 else if (state==SPL3)e=(r=='a')?best_frame:F;
8734 else if (state==SPL4)e=(r=='g')?best_frame:F;
8735 else if (state==NC)e=-best_frame;
8736 for ( pstate=0; pstate<nstate; pstate++)
8738 v=e+transitions[pstate][state]+v_tab[p-1][pstate];
8739 if (pstate==0 ||(v>best_pstate_v) ){best_pstate_v=v;best_pstate_p=pstate;}
8742 v_tab[p][state]=best_pstate_v;
8743 v_tab_p[p][state]=best_pstate_p;
8744 if (state==0 ||best_pstate_v>best_state_v ){best_state_p=state; best_state_v=best_pstate_v;}
// Traceback from the last column to the first.
8750 for (p=0; p< A->len_aln; p++)T->seq_al[s][p]='.';
8751 for (p=A->len_aln; p>0; p--)
8753 if ( best_state_p==0)T->seq_al[s][p-1]=toupper(translate_dna_codon (A->seq_al[s]+(p-1), 'x'));
8754 else if ( best_state_p>=SPL1 && best_state_p<=SPL4)T->seq_al[s][p-1]='-';
8755 best_state_p=v_tab_p[p][best_state_p];
// mutate_cdna_aln: applies random mutations to a coding-DNA alignment.
// Rates are read with atoi() from the environment variables
// NEUTRAL_SUBSTITUTION, RANDOM_SUBSTITUTION, RANDOM_DELETION,
// AMINO_ACID_DELETION and AMINO_ACID_SUBSTITUTION (all requested with IS_FATAL),
// and the random generator is seeded with time(NULL).
// Visible passes, each skipped when its rate is 0:
//   1. neutral substitutions : repeated neutral_substitution times; a random
//      nucleotide is kept only when the codon still translates to the same
//      residue and that residue is not 'o';
//   2. random substitutions  : a position is changed unless addrand(100)
//      exceeds the rate;
//   3. amino-acid substitutions : every third column, the codon is replaced by
//      mutate_amino_acid(r1, "clustalw_col");
//   4. amino-acid deletions  : every third column, unless addrand(1000) exceeds
//      the rate, 1 to 4 codons are overwritten with '-';
//   5. nucleotide deletions  : unless addrand(1000) exceeds the rate.
// pos[a][b] is the 1-based index into (A->S)->seq[a] for column b; values <=0
// are skipped.
// NOTE(review): A->S is released near the top and dereferenced afterwards, so
// it is presumably rebuilt on a line not visible here - confirm. In pass 4 the
// bound c<A->len_aln is applied to an index into the sequence held in A->S
// rather than into the alignment. The return is not visible.
8765 Alignment * mutate_cdna_aln ( Alignment *A)
8770 int neutral_substitution=50;
8771 int random_substitution=0;
8772 int random_deletion=0;
8773 int amino_acid_deletion=0;
8774 int amino_acid_substitution=0;
8775 char nuc_list[]="agct";
8778 neutral_substitution=atoi(get_env_variable ("NEUTRAL_SUBSTITUTION",IS_FATAL));
8779 random_substitution =atoi(get_env_variable ("RANDOM_SUBSTITUTION", IS_FATAL));
8780 random_deletion =atoi(get_env_variable ("RANDOM_DELETION", IS_FATAL));
8781 amino_acid_deletion =atoi(get_env_variable ("AMINO_ACID_DELETION", IS_FATAL));
8782 amino_acid_substitution =atoi(get_env_variable ("AMINO_ACID_SUBSTITUTION", IS_FATAL));
8785 if (A->S)free_sequence ( A->S, (A->S)->nseq);
8788 addrandinit(time (NULL));
8791 pos=aln2pos_simple ( A, A->nseq);
8793 /* 1 Apply neutral substitutions */
8795 if ( neutral_substitution)
8797 for ( c=0; c< neutral_substitution; c++)
8799 for ( a=0; a< A->nseq; a++)
8802 for ( b=0; b< A->len_aln; b++)
8805 if (pos[a][b]<=0)continue;
// ps is the 0-based index of the first nucleotide of the codon holding pos[a][b].
8806 ps=MAX(0,pos[a][b]-(pos[a][b]-1)%3-1);
8809 n1=(A->S)->seq[a][pos[a][b]-1];
8810 r1=translate_dna_codon ( (A->S)->seq[a]+ps, 'o');
8812 n2=nuc_list[(int)addrand((unsigned long) 4)];
8813 (A->S)->seq[a][pos[a][b]-1]=n2;
8814 r2=translate_dna_codon ( (A->S)->seq[a]+ps, 'o');
// Keep the change only when it is synonymous; otherwise restore n1.
8817 if ( r1==r2 && r1!='o')A->seq_al[a][b]=n2;
8819 else (A->S)->seq[a][pos[a][b]-1]=n1;
8825 /* 2 Apply substitutions */
8826 if ( random_substitution)
8828 for ( a=0; a< A->nseq; a++)
8830 for ( b=0; b< A->len_aln; b++)
8832 if (pos[a][b]<=0)continue;
8833 if (addrand ((unsigned long) 100)>random_substitution)continue;
8835 n1=nuc_list[(int)addrand((unsigned long)4)];
8836 (A->S)->seq[a][pos[a][b]-1]=n1;
8842 /* 3 Apply amino acid substitutions */
8843 if ( amino_acid_substitution)
8845 for ( a=0; a< A->nseq; a++)
8847 for ( b=0; b< A->len_aln; b+=3)
8849 if (pos[a][b]<=0)continue;
8850 if (addrand ((unsigned long) 100)>amino_acid_substitution)continue;
8851 ps=MAX(0,pos[a][b]-(pos[a][b]-1)%3-1);
8853 r1=translate_dna_codon ( (A->S)->seq[a]+ps, 'o');
8854 new_codon=mutate_amino_acid(r1, "clustalw_col");
8856 for ( c=ps; c<ps+3; c++)(A->S)->seq[a][c]=new_codon[c-ps];
// Copy the mutated sequence back into the aligned row.
8858 for ( b=0; b< A->len_aln; b++)
8860 if (pos[a][b]<=0)continue;
8861 else A->seq_al[a][b]=(A->S)->seq[a][pos[a][b]-1];
8865 /* 3 Apply amino acid deletions */
8866 if ( amino_acid_deletion)
8868 for ( a=0; a< A->nseq; a++)
8870 for ( b=0; b< A->len_aln; b+=3)
8872 if (pos[a][b]<=0)continue;
8873 if (addrand ((unsigned long) 1000)>amino_acid_deletion)continue;
8874 ps=MAX(0,pos[a][b]-(pos[a][b]-1)%3-1);
8875 n=addrand ((unsigned long) 4)+1;
8877 for ( c=ps; c<ps+(3*n) && c<A->len_aln; c++)(A->S)->seq[a][c]='-';
8879 for ( b=0; b< A->len_aln; b++)
8881 if (pos[a][b]<=0)continue;
8882 else A->seq_al[a][b]=(A->S)->seq[a][pos[a][b]-1];
8886 /* 4 Apply amino acid insertions */
8888 /*FRAMESHIFT MUTATIONS*/
8889 /* 5 Apply nucleotide deletions*/
8890 if ( random_deletion)
8892 for ( a=0; a< A->nseq; a++)
8894 for ( b=0; b< A->len_aln; b++)
8896 if (pos[a][b]<=0)continue;
8897 if (addrand ((unsigned long) 1000)>random_deletion)continue;
8900 (A->S)->seq[a][pos[a][b]-1]=n1;
8905 /* 6 Apply nucleotide deletions*/
// clean_est: rewrites row 0 of A column by column from the profile counts in
// A->P->count.
// For each column: tot is the sum of count[0..3], best is the largest of
// count[0..4] as returned by best_int(). Visible outcomes:
//   - warning branch (no information): a message on stderr and '-';
//   - count[4]*100/tot > 30         : '-';
//   - best*100/tot < 50             : 'n'.
// NOTE(review): the condition guarding the warning branch (presumably tot==0,
// which would also protect the divisions), the remaining branch and the return
// are not visible here; count[4] presumably holds the gap count - confirm.
8911 Alignment* clean_est ( Alignment *A)
8913 /*Rules are as follow:
8914 Internal Gap > 30% Requences ----> -
8915 Best Residue < 50% Residues ----> 'N'
8921 for ( a=0; a< A->len_aln; a++)
8924 for (tot=0, b=0; b<4; b++)tot+=(A->P)->count[b][a];
8925 best=best_int (5,1, &c, (A->P)->count[0][a],(A->P)->count[1][a],(A->P)->count[2][a],(A->P)->count[3][a],(A->P)->count[4][a]);
8929 fprintf ( stderr, "\nWARNING: POSITION WITH NO INFORMATION [clean_est:%s]", PROGRAM);
8930 A->seq_al[0][a]='-';
8932 else if (((A->P)->count[4][a]*100)/tot >30)A->seq_al[0][a]='-';
8933 else if ( (best*100)/tot<50)A->seq_al[0][a]='n';
// make_symbols: builds a table of symbol-group strings selected by name.
// The table is allocated as STRING x STRING characters. Recognised names:
//   "3d_ali" : "gih", "eb", "x", "#l"
//   "all"    : "aa","AA","bb","BB",... for the 26 letters, then "--"
//   "set1"   : "ilvmfywhktcagH", "reqdnsP", "--", "#l"
//   "set2"   : "gsacT", "ndtvpS", "ilkreqL", "--", "#l" (n[0] counts them)
//   "any"    : "*x"
// n : output, number of entries; only the "set2" branch updates it on a
//     visible line.
// NOTE(review): the assignments of n[0] for the other names, the behaviour for
// an unknown name and the return are not visible here.
8941 char **make_symbols ( char *name, int *n)
8945 symbol=declare_char ( STRING, STRING);
8947 if ( strcmp (name, "3d_ali")==0)
8949 sprintf ( symbol[0], "gih");
8950 sprintf ( symbol[1], "eb");
8951 sprintf ( symbol[2], "x");
8952 sprintf ( symbol[3], "#l");
8956 else if ( strcmp (name, "all")==0)
8959 for ( i=0,a=0; a<26; a++)
8961 sprintf ( symbol[i++], "%c%c", 'a'+a, 'a'+a);
8962 sprintf ( symbol[i++], "%c%c", 'A'+a, 'A'+a);
8964 sprintf ( symbol[i++], "--");
8968 else if ( strcmp (name, "set1")==0)
8970 sprintf ( symbol[0], "ilvmfywhktcagH");
8971 sprintf ( symbol[1], "reqdnsP");
8972 sprintf ( symbol[2], "--");
8973 sprintf ( symbol[3], "#l");
8976 else if ( strcmp (name, "set2")==0)
8979 sprintf ( symbol[n[0]++], "gsacT");
8980 sprintf ( symbol[n[0]++], "ndtvpS");
8981 sprintf ( symbol[n[0]++], "ilkreqL");
8982 sprintf ( symbol[n[0]++], "--");
8983 sprintf ( symbol[n[0]++],"#l");
8985 else if ( strcmp ( name, "any")==0)
8987 sprintf ( symbol[0], "*x");
// testdna2gene: test driver for dna2gene().
// Builds a weight vector w with +1 for every upper-case character of dna and
// -1 for every other character, then replaces dna by dna2gene(dna, w).
// NOTE(review): the loop header, the release of w and the return are not
// visible here.
8997 char *testdna2gene (char *dna)
9002 w=vcalloc(l+1, sizeof (int));
9005 w[a]=isupper (dna[a])?1:-1;
9007 dna=dna2gene (dna,w);
// dnaseq2geneseq: runs dna2gene() on every sequence of S with the matching
// weight row w[a] and stores the results in a duplicate of S.
// A result containing the character 'F' is reported through HERE() as a
// frameshift. PS->len[a] and PS->seq[a] are replaced by the length and a fresh
// copy of the result, then reset_sequence_len(PS) is called.
// S is not modified on any visible line.
// NOTE(review): no release of p or of the previous PS->seq[a] is visible here,
// and the return is not visible; presumably PS is returned - confirm.
9012 Sequence *dnaseq2geneseq (Sequence *S, int **w)
9018 PS=duplicate_sequence (S);
9019 for (a=0; a<S->nseq; a++)
9021 p=dna2gene (S->seq[a], w[a]);
9022 if (strstr (p, "F"))
9024 HERE ("----FRAMESHIFT: %s", S->name[a]);
9027 PS->len[a]=strlen(p);
9028 PS->seq[a]=vcalloc (PS->len[a]+1, sizeof (char));
9029 sprintf ( PS->seq[a], "%s", p);
9032 PS=reset_sequence_len (PS);
// dna2gene: dynamic-programming gene-structure labelling of a DNA string.
// dna : input nucleotides.
// w   : one integer weight per nucleotide; coding states emit +(w+shiftw) and
//       non-coding states emit -(w+shiftw).
// States are numbered through the counter ns: START, I1..I3, NCE, NCS and, for
// each of the three codon positions, C<k>, S1_<k>, S2_<k>, NC<k>, S3_<k>,
// S4_<k>; ST is a stop state. string[] holds a printable name for each state.
// The tables trans (ns x ns) and em (ns x 256) start as forbiden; sc_mat keeps
// the best score and tb_mat the best predecessor per position and state, while
// C1_mat / C2_mat carry the last nucleotides seen in codon positions 1 and 2 so
// that a transition into C3 completing a stop codon can be forbidden.
// Output: out_dna (2*strlen(dna)+1 bytes) where coding positions are upper case,
// other positions lower case, and frameshift_symbol 'F' is inserted once or
// twice at transitions that break the C1->C2->C3 order.
// NOTE(review): frameshift1 and frameshift2 are both set to forbiden here. The
// initial value of ns, the declaration of ST, exon_reward, shiftw, the traceback
// loop header, the release of w-independent buffers and the return are not
// visible on the lines shown.
9036 char *dna2gene (char *dna, int *w)
9038 int a, b, c, ns,l,od;
9039 int I1, I2, I3, START, NCE, NCS;
9040 int C1, S1_1, S2_1, S3_1, S4_1,NC1;
9041 int C2, S1_2, S2_2, S3_2, S4_2,NC2;
9042 int C3, S1_3, S2_3, S3_3, S4_3,NC3;
9047 double **C1_mat, **C2_mat;
9048 double *tb, **sc_mat, **tb_mat;
9049 double **em, **trans;
9053 double forbiden =-100000;
9056 double exon_penalty;
9058 double nostop_penalty;
9060 int frameshift_symbol='F';
9066 three_dna=translate_dna_seq_on3frame (dna, 'x', NULL);
9067 lower_string(three_dna);
9069 for (a=0; a<l; a++){max=MAX(max,w[a]);avg_w+=(double)w[a];}
9073 exon_penalty=-100*avg_w;
9075 nostop_penalty=-100 *avg_w;
9076 frameshift1=forbiden;
9077 frameshift2=frameshift1;
9079 out_dna=vcalloc ( 2*strlen (dna)+1, sizeof (char));
9080 sprintf (out_dna, "%s", dna);
// State indices are handed out sequentially from the counter ns.
9082 START=ns++; I1=ns++;I2=ns++;I3=ns++;NCE=ns++;NCS=ns++;
9084 C1=ns++; S1_1=ns++;S2_1=ns++;NC1=ns++;S3_1=ns++;S4_1=ns++;
9085 C2=ns++; S1_2=ns++;S2_2=ns++;NC2=ns++;S3_2=ns++;S4_2=ns++;
9086 C3=ns++; S1_3=ns++;S2_3=ns++;NC3=ns++;S3_3=ns++;S4_3=ns++;
// Printable state names; rows are 10 bytes wide.
9089 string=declare_char ( ns+1, 10);
9090 sprintf (string [S1_1], "S1_1");
9091 sprintf (string [S2_1], "S2_1");
9092 sprintf (string [S3_1], "S3_1");
9093 sprintf (string [S4_1], "S4_1");
9095 sprintf (string [S1_2], "S1_2");
9096 sprintf (string [S2_2], "S2_2");
9097 sprintf (string [S3_2], "S3_2");
9098 sprintf (string [S4_2], "S4_2");
9100 sprintf (string [S1_3], "S1_3");
9101 sprintf (string [S2_3], "S2_3");
9102 sprintf (string [S3_3], "S3_3");
9103 sprintf (string [S4_3], "S4_3");
9105 sprintf (string [START], "START");
9106 sprintf (string [NCE], "NCE");
9107 sprintf (string [NCS], "NCS");
9108 sprintf (string [NC1], "NC1");
9109 sprintf (string [NC2], "NC2");
9110 sprintf (string [NC3], "NC3");
9111 sprintf (string [I1], "I1");
9112 sprintf (string [I2], "I2");
9113 sprintf (string [I3], "I3");
9115 sprintf (string [C1], "C1");
9116 sprintf (string [C2], "C2");
9117 sprintf (string [C3], "C3");
9119 sprintf (string [ST], "STOP");
9121 trans=declare_double(ns,ns);
9122 em=declare_double (ns,256);
9123 tb=vcalloc ( l+1, sizeof (double));
9124 sc_mat=declare_double (l+1, ns);
9125 tb_mat=declare_double (l+1, ns);
9126 C1_mat=declare_double (l+1, ns);
9127 C2_mat=declare_double (l+1, ns);
// Everything is forbidden until explicitly allowed below.
9129 for (a=0; a<ns; a++)
9131 for (b=0; b<ns; b++)trans[a][b]=forbiden;
9132 for (b=0; b<256; b++)em[a][b]=forbiden;
9136 trans[START][NCS]=0;
9138 trans[NCS][NCE]=0;//allow sequence entirely non coding
9146 trans[C1][C3]=frameshift1;
9147 trans[C1][C1]=frameshift2;
9150 trans[C2][C1]=frameshift1;
9151 trans[C2][C2]=frameshift2;
9154 trans[C3][C1]=exon_reward;
9155 trans[C3][C2]=frameshift1;
9156 trans[C3][C3]=frameshift2;
9158 trans[C3][NCE]=nostop_penalty;
9159 trans[C3][ST] =0; //normal termination
9163 trans[C1][S1_1]=exon_penalty;
9164 trans[S1_1][S2_1]=0;
9168 trans[S3_1][S4_1]=0;
9170 trans[S4_1][C3]=frameshift1;
9171 trans[S4_1][C1]=frameshift2;
9174 trans[C2][S1_2]=exon_penalty;
9175 trans[S1_2][S2_2]=0;
9179 trans[S3_2][S4_2]=0;
9181 trans[S4_2][C1]=frameshift1;
9182 trans[S4_2][C2]=frameshift2;
9184 trans[C3][S1_3]=exon_penalty;
9185 trans[S1_3][S2_3]=0;
9189 trans[S3_3][S4_3]=0;
9191 trans[S4_3][C2]=frameshift1;
9192 trans[S4_3][C3]=frameshift2;
// Only START is reachable before the first nucleotide.
9214 for (a=0; a<ns; a++)sc_mat[0][a]=tb_mat[0][a]=forbiden;
9215 sc_mat[0][START]=tb_mat[0][START]=0;
9217 for (a=1; a<=l ;a++)
9220 r=toupper (dna[a-1]);
9221 for (b=0; b<ns; b++)
9223 double best_sc,e,lw;
9226 lw=(double)w[a-1]+shiftw;
// three_dna holds 'x' where a stop was translated (lower-cased above).
9228 if (b==ST && three_dna[a-1]=='x')e=0;
9229 else if (b==C1 || b == C2 || b== C3)e=lw;
9230 else if ( b==NC1|| b==NC2 || b==NC3 || b==NCE || b==NCS)e=-lw;
9235 sc_mat[a][b]=forbiden;
9240 for (best_sc=forbiden,best_t=0,c=0; c<ns; c++)
9242 double tr, sc, p_sc;
9244 p_sc=sc_mat[a-1][c];
9246 //Frameshift handling
9248 if ( tr== forbiden || p_sc==forbiden);
9249 else if (tr!=forbiden)
// After an out-of-order entry the missing codon positions are treated as 'N'.
9251 if (b==C2 && c!=C1 && c!=S4_1){p_C1='N'; p_C2=r; }
9252 else if(b==C3 && c!=C2 && c!=S4_2){p_C1='N'; p_C2='N';}
9255 p_C1=C1_mat[a-1][c];
9256 p_C2=C2_mat[a-1][c];
9259 if (b==C3 && is_stop (p_C1, p_C2,r)){tr=forbiden;}
9263 if (c==0 || sc>best_sc)
9272 C1_mat[a][b]=(b==C1)?r:C1_mat[a-1][best_t];
9273 C2_mat[a][b]=(b==C2)?r:C2_mat[a-1][best_t];
9274 sc_mat[a][b]=best_sc;
9275 tb_mat [a][b]=best_t;
9287 // HERE ("**%d [%s] %c in %d", b,string[b], dna[a-1], a);
// st counts positions from the stop state onwards; the first three stay coding.
9301 if ( st || t==ST)st++;
9302 coding=(t==C1 || t==C2 || t==C3 || t==I1 || t==I2 || t==I3 ||(st && st<=3))?1:0;
9306 if (t==C1 && (pt==C2 || pt==S4_2)){out_dna[od++]=frameshift_symbol;}
9307 else if (t==C1 && (pt==C1 || pt==S4_1)){out_dna[od++]=frameshift_symbol;out_dna[od++]=frameshift_symbol;}
9309 else if (t==C2 && (pt==C3 || pt==S4_3)){out_dna[od++]=frameshift_symbol;}
9310 else if (t==C2 && (pt==C2 || pt==S4_2)){out_dna[od++]=frameshift_symbol;out_dna[od++]=frameshift_symbol;}
9312 else if (t==C3 && (pt==C1 || pt==S4_1)){out_dna[od++]=frameshift_symbol;}
9313 else if (t==C3 && (pt==C3 || pt==S4_3)){out_dna[od++]=frameshift_symbol;out_dna[od++]=frameshift_symbol;}
9315 if (coding)out_dna[od++]=toupper (r);
9316 else out_dna[od++]=tolower(r);
9320 free_double (tb_mat, -1);
9321 free_double (sc_mat, -1);
9322 free_double (trans, -1);
9323 free_double (em, -1);
9324 free_double (C1_mat, -1);
9325 free_double (C2_mat, -1);
/*
 * res_weights2avg: walks every weight w[a][b] (a = sequence index in R,
 * b = residue index below R->len[a]), adding each value to avg and counting
 * the visited entries in n.
 * NOTE(review): the declarations and the return statement are not visible in
 * this excerpt; presumably the mean (avg/n) is returned -- confirm, and check
 * how an empty R (n==0) is handled.
 */
9332 int res_weights2avg(Sequence *R, int **w)
9338 for (a=0; a<R->nseq; a++)
9339 for (b=0; b<R->len[a]; b++){avg+=w[a][b];n++;}
/*
 * res_weights2min: folds MIN over every weight w[a][b] of every sequence in R.
 * NOTE(review): the initial value of min and the return statement are not
 * visible in this excerpt -- presumably min is returned; confirm.
 */
9342 int res_weights2min(Sequence *R, int **w)
9347 for (a=0; a<R->nseq; a++)
9348 for (b=0; b<R->len[a]; b++)min=MIN(min,(w[a][b]));
/*
 * res_weights2max: folds MAX over every weight w[a][b] of every sequence in R.
 * NOTE(review): the initial value of max and the return statement are not
 * visible in this excerpt -- presumably max is returned; confirm.
 */
9351 int res_weights2max(Sequence *R, int **w)
9356 for (a=0; a<R->nseq; a++)
9357 for (b=0; b<R->len[a]; b++)max=MAX(max,(w[a][b]));
/*
 * scan_res_weights4ac: tries every threshold a in [start, end] (stride step),
 * computing the TP/TN/FP/FN counts and the derived accuracy for each one, then
 * reports on stderr the counts obtained with best_t and with 2*avg, where avg
 * comes from res_weights2avg(R,w).
 * NOTE(review): the code that selects best_t inside the loop is not visible in
 * this excerpt.
 */
9360 int scan_res_weights4ac (Sequence *R, int **w, int start, int end, int step)
9368 avg=res_weights2avg(R,w);
9370 for (a=start; a<=end; a+=step)
9373 count=res_weights2accuracy_counts (R,w,a,NULL);
9374 acc=counts2accuracy (count);
/* per-threshold buffers are released before the next iteration */
9381 vfree (count);vfree (acc);
/* report for the selected threshold */
9383 count=res_weights2accuracy_counts (R,w,best_t,NULL);
9384 acc=counts2accuracy (count);
9385 fprintf (stderr, "\nBest_T: %d ", best_t);
9386 display_accuracy (count,stderr);
/* report for twice the average weight */
/* NOTE(review): count and acc are reassigned here; no vfree of the Best_T
   buffers is visible between the two reports -- possible leak, confirm. */
9388 count=res_weights2accuracy_counts (R,w,2*avg,NULL);
9389 acc=counts2accuracy (count);
9390 fprintf (stderr, " Avg_T: %d ", 2*avg);
9391 display_accuracy (count,stderr);
9393 vfree (acc); vfree (count);
/*
 * shift_res_weights: visits every weight position (a = sequence index in R,
 * b = residue index below R->len[a]).
 * NOTE(review): the loop body and the return statement are not visible in this
 * excerpt; presumably shift is added to each w[a][b] and w is returned --
 * confirm.
 */
9396 int ** shift_res_weights ( Sequence *R, int **w, int shift)
9399 for (a=0; a<R->nseq; a++)
9400 for (b=0; b<R->len[a]; b++)
/*
 * res_weights2accuracy_counts: compares the reference annotation carried by
 * the case of R->seq (upper-case = coding) with a prediction obtained by
 * thresholding the weights (w[a][b] strictly greater than T = predicted
 * coding), and accumulates the confusion counts into result:
 *   result[0]=TP, result[1]=TN, result[2]=FP, result[3]=FN.
 * When result is NULL a zeroed array of 4 floats is allocated; otherwise the
 * counts are added to the values already stored in the caller's array.
 * NOTE(review): the return statement is not visible -- presumably result.
 */
9404 float *res_weights2accuracy_counts ( Sequence *R, int **w,int T, float *result)
9406 int a, b, coding,pcoding;
9408 if (!result)result=vcalloc (4, sizeof (float));
9410 for (a=0; a<R->nseq; a++)
9412 for (b=0; b<R->len[a]; b++)
/* coding: reference label; pcoding: predicted label */
9414 coding=(isupper(R->seq[a][b]))?1:0;
9415 pcoding=(w[a][b]>T)?1:0;
9416 if ( coding && pcoding)result[0]++;//TP
9417 else if ( !coding && !pcoding)result[1]++;//TN
9418 else if ( !coding && pcoding)result[2]++;//FP
9419 else if ( coding && !pcoding)result[3]++;//FN
/*
 * genepred_seq2accuracy_counts4all: for every sequence of R, prints
 * "gene: <name>" on stderr and, for each sequence of T with the same name
 * (strm) -- provided the R sequence contains upper-case characters --
 * computes a fresh set of accuracy counts, displays it on stderr and frees it.
 * NOTE(review): result is allocated below but is not used in any line visible
 * in this excerpt (each comparison passes NULL and frees its own array) --
 * check whether it is freed or is simply dead.
 */
9425 void genepred_seq2accuracy_counts4all (Sequence *R, Sequence *T)
9428 float *result =vcalloc (4, sizeof (float));
9430 fprintf ( stderr, "\n");
9432 for (a=0; a<R->nseq; a++)
9434 fprintf ( stderr, "gene: %s ", R->name[a]);
9435 for (b=0; b<T->nseq; b++)
9437 if ( strm (R->name[a], T->name[b]) && hasupper(R->seq[a]))
/* the value returned by display_accuracy is what gets freed here */
9439 vfree (display_accuracy (genepred2accuracy_counts (R->seq[a], T->seq[b], NULL),stderr));
/*
 * genepred_seq2accuracy_counts: accumulates, into a single result array, the
 * accuracy counts of every (R sequence, T sequence) pair whose names match
 * (strm) and whose R sequence contains upper-case characters.
 * A zeroed array of 4 floats is allocated when result is NULL.
 * NOTE(review): the return statement is not visible -- presumably result.
 */
9447 float* genepred_seq2accuracy_counts (Sequence *R, Sequence *T,float *result)
9451 if (!result)result=vcalloc (4, sizeof (float));
9453 for (a=0; a<R->nseq; a++)
9454 for (b=0; b<T->nseq; b++)
9455 if ( strm (R->name[a], T->name[b]) && hasupper(R->seq[a]))
9456 genepred2accuracy_counts (R->seq[a], T->seq[b], result);
/*
 * genepred2accuracy_counts: compares a reference gene string with a predicted
 * one, position by position, using letter case as the label (upper-case =
 * coding), and adds the outcome to result:
 *   result[0]=TP, result[1]=TN, result[2]=FP, result[3]=FN.
 * Both inputs are copied first, so ref and target themselves are not modified.
 * A zeroed array of 4 floats is allocated when result is NULL.
 * The program is terminated through myexit(EXIT_FAILURE) when the two filtered
 * copies differ in length.
 * NOTE(review): the per-position loop header, the release of ref2/target2 and
 * the return statement are not visible in this excerpt.
 */
9460 float* genepred2accuracy_counts (char *ref, char *target , float *result)
9462 char *ref2, *target2;
9464 if ( !result) result=vcalloc (4, sizeof (float));
9465 ref2=vcalloc ( strlen (ref)+1, sizeof (char));
9466 sprintf ( ref2, "%s", ref);
9468 target2=vcalloc ( strlen (target)+1, sizeof (char));
9469 sprintf ( target2, "%s", target);
/* presumably deletes every 'F'/'f' character from the copies -- confirm
   against remove_charset */
9471 remove_charset (ref2, "Ff");
9472 remove_charset (target2, "Ff");
9474 if ( strlen (target2) != strlen (ref2))
9475 {fprintf (stderr, "ERROR: Gene and target have different length [FATAL]\n"); myexit (EXIT_FAILURE);}
9480 int coding, pcoding;
/* isupper's raw result is kept: any non-zero value means "coding" */
9481 coding =isupper (ref2[a]);
9482 pcoding=isupper (target2[a]);
9483 if ( coding && pcoding)result[0]++;//TP
9484 else if ( !coding && !pcoding)result[1]++;//TN
9485 else if ( !coding && pcoding)result[2]++;//FP
9486 else if ( coding && !pcoding)result[3]++;//FN
/*
 * is_stop: tells whether the three nucleotides r1,r2,r3 form a stop codon.
 * Returns 0 when r2 or r3 is the NUL character, or when any of the three is
 * 'n'/'N'. Otherwise the lower-cased triplet is translated with 'x' as the
 * stop symbol and 1 is returned when the translation is 'x'.
 * NOTE(review): the declaration of codon and the final return are not visible
 * in this excerpt -- presumably 0 is returned for every other triplet.
 */
9494 int is_stop( char r1, char r2, char r3)
9498 if (!r2 || !r3) return 0;
9499 else if (tolower (r1)=='n' || tolower(r2)=='n' || tolower(r3)=='n') return 0;
9502 sprintf (codon, "%c%c%c", tolower(r1), tolower(r2), tolower(r3));
9503 if (translate_dna_codon (codon, 'x')=='x')return 1;
/*
 * translate_dna_seq_on3frame: translates the codon starting at EVERY position
 * of dna_seq (window moved one nucleotide at a time), so prot[a] holds the
 * residue of the triplet beginning at offset a. stop is the character handed
 * to translate_dna_codon for stop codons.
 * prot is allocated (l+2 bytes, zero-filled) when the caller passes NULL.
 * NOTE(review): the assignment of l (presumably strlen(dna_seq)), the release
 * of buf and the return statement are not visible in this excerpt.
 */
9509 char * translate_dna_seq_on3frame ( char *dna_seq, char stop, char *prot)
9515 if ( prot==NULL)prot=vcalloc ( l+2, sizeof (char));
/* work on a lower-case copy in which every 't' is replaced by 'u' */
9517 buf=vcalloc (l+4, sizeof (char));
9518 sprintf (buf, "%s", dna_seq);
9519 lower_string ( buf);
9520 for ( a=0; a< l; a++)buf[a]=(buf[a]=='t')?'u':buf[a];
9522 for (a=0; a< l; a++)
9523 prot[a]=translate_dna_codon (buf+a, stop);
/*
 * translate_dna_seq: translates dna_seq codon by codon, starting at offset
 * frame and advancing three nucleotides at a time; prot[b] receives the b-th
 * translated residue. stop is the character handed to translate_dna_codon for
 * stop codons.
 * prot is allocated (l bytes, zero-filled) when the caller passes NULL.
 * NOTE(review): the assignment of l (presumably strlen(dna_seq)), the release
 * of buf, any terminator written after the loop and the return statement are
 * not visible in this excerpt. If a terminator is stored at prot[b] after the
 * loop, an l-byte buffer is too small for inputs of length 0 or 1 -- confirm.
 */
9529 char * translate_dna_seq ( char *dna_seq, int frame, char stop, char *prot)
9536 if ( prot==NULL)prot=vcalloc ( l, sizeof (char));
/* work on a lower-case copy in which every 't' is replaced by 'u' */
9538 buf=vcalloc (l+4, sizeof (char));
9539 sprintf (buf, "%s", dna_seq);
9540 lower_string ( buf);
9541 for ( a=0; a< l; a++)buf[a]=(buf[a]=='t')?'u':buf[a];
9544 for ( b=0,a=0+frame; a< l; a+=3,b++)
9547 prot[b]=translate_dna_codon (buf+a, stop);
/*
 * back_translate_dna_codon: writes into r a three-character codon for the
 * amino acid aa.
 *   - gap characters give "---";
 *   - a raw value 0..9 gives that digit printed three times;
 *   - a digit character '0'..'9' gives that character three times;
 *   - otherwise one codon of the residue is chosen: always the first one when
 *     deterministic is non-zero, else one picked with rand().
 * Non-gap input is lower-cased before the tests.
 * The "codons read as 'x'" comments below refer to the table in
 * translate_dna_codon.
 * NOTE(review): most of the per-residue branch conditions, the declaration of
 * r and the return statement are not visible in this excerpt. r is allocated
 * only while it is NULL -- presumably a static buffer shared by all calls, in
 * which case each call overwrites the previous result; confirm.
 */
9554 char * back_translate_dna_codon ( char aa, int deterministic)
9560 if ( r==NULL)r=vcalloc (4, sizeof (char));
9562 if (!is_gap(aa))aa=tolower(aa);
9564 if (is_gap(aa))sprintf (r, "---");
9565 else if ( aa>=0 && aa<=9)
9567 sprintf (r, "%d%d%d", aa, aa,aa);
9569 else if ( aa>='0' && aa<='9')
9571 sprintf (r, "%c%c%c", aa, aa,aa);
/* codons read as 'a' */
9575 choice=(deterministic)?0:rand()%4;
9576 if ( choice==0)sprintf (r, "gca");
9577 else if ( choice==1)sprintf (r, "gcg");
9578 else if ( choice==2)sprintf (r, "gcc");
9579 else if ( choice==3)sprintf (r, "gct");
/* codons read as 'c' */
9583 choice=(deterministic)?0:rand()%2;
9584 if ( choice==0)sprintf (r, "tgc");
9585 else if ( choice==1)sprintf (r, "tgt");
/* codons read as 'd' */
9589 choice=(deterministic)?0:rand()%2;
9590 if ( choice==0)sprintf (r, "gac");
9591 else if ( choice==1)sprintf (r, "gat");
/* codons read as 'e' */
9596 choice=(deterministic)?0:rand()%2;
9597 if ( choice==0)sprintf (r, "gaa");
9598 else sprintf (r, "gag");
/* codons read as 'f' */
9602 choice=(deterministic)?0:rand()%2;
9603 if ( choice==0)sprintf (r, "ttc");
9604 else sprintf (r, "ttt");
/* codons read as 'g' */
9608 choice=(deterministic)?0:rand()%4;
9609 if ( choice==0) sprintf (r, "gga");
9610 else if ( choice==1) sprintf (r, "ggg");
9611 else if ( choice==2) sprintf (r, "ggc");
9612 else if ( choice==3) sprintf (r, "ggt");
/* codons read as 'h' (the line drawing choice is not visible here) */
9617 if ( choice==0)sprintf (r, "cac");
9618 else sprintf (r, "cat");
/* codons read as 'i' */
9622 choice=(deterministic)?0:rand()%3;
9623 if ( choice==0) sprintf (r, "ata");
9624 else if ( choice==1) sprintf (r, "atc");
9625 else if ( choice==2) sprintf (r, "att");
/* codons read as 'k' */
9629 choice=(deterministic)?0:rand()%2;
9630 if ( choice==0) sprintf (r, "aaa");
9631 else if ( choice==1) sprintf (r, "aag");
/* codons read as 'l' */
9636 choice=(deterministic)?0:rand()%6;
9637 if ( choice==0) sprintf (r, "cta");
9638 else if ( choice==1) sprintf (r, "ctg");
9639 else if ( choice==2) sprintf (r, "ctc");
9640 else if ( choice==3) sprintf (r, "ctt");
9641 else if ( choice==4) sprintf (r, "tta");
9642 else if ( choice==5) sprintf (r, "ttg");
9644 else if ( aa=='m')sprintf ( r, "atg");
/* codons read as 'n' */
9647 choice=(deterministic)?0:rand()%2;
9648 if ( choice==0) sprintf (r, "aac");
9649 else if ( choice==1) sprintf (r, "aat");
/* codons read as 'p' */
9653 choice=(deterministic)?0:rand()%4;
9654 if ( choice==0) sprintf (r, "cca");
9655 else if ( choice==1) sprintf (r, "ccg");
9656 else if ( choice==2) sprintf (r, "ccc");
9657 else if ( choice==3) sprintf (r, "cct");
/* codons read as 'q' */
9661 choice=(deterministic)?0:rand()%2;
9662 if ( choice==0) sprintf (r, "caa");
9663 else if ( choice==1) sprintf (r, "cag");
/* codons read as 'r' */
9667 choice=(deterministic)?0:rand()%6;
9668 if ( choice==0) sprintf (r, "cga");
9669 else if ( choice==1) sprintf (r, "cgg");
9670 else if ( choice==2) sprintf (r, "cgc");
9671 else if ( choice==3) sprintf (r, "cgt");
9672 else if ( choice==4) sprintf (r, "aga");
9673 else if ( choice==5) sprintf (r, "agg");
/* codons read as 's' */
9678 choice=(deterministic)?0:rand()%6;
9679 if ( choice==0) sprintf (r, "tca");
9680 else if ( choice==1) sprintf (r, "tcg");
9681 else if ( choice==2) sprintf (r, "tcc");
9682 else if ( choice==3) sprintf (r, "tct");
9683 else if ( choice==4) sprintf (r, "agt");
9684 else if ( choice==5) sprintf (r, "agc");
/* codons read as 't' */
9689 choice=(deterministic)?0:rand()%4;
9690 if ( choice==0) sprintf (r, "aca");
9691 else if ( choice==1) sprintf (r, "acg");
9692 else if ( choice==2) sprintf (r, "acc");
9693 else if ( choice==3) sprintf (r, "act");
/* codons read as 'v' */
9697 choice=(deterministic)?0:rand()%4;
9698 if ( choice==0) sprintf (r, "gta");
9699 else if ( choice==1) sprintf (r, "gtg");
9700 else if ( choice==2) sprintf (r, "gtc");
9701 else if ( choice==3) sprintf (r, "gtt");
/* codons read as 'y' */
9709 choice=(deterministic)?0:rand()%2;
9710 if ( choice==0) sprintf (r, "tac");
9711 else if ( choice==1) sprintf (r, "tat");
/*
 * translate_dna_codon: translates the first three characters of sequence into
 * a one-letter amino-acid code.
 *   sequence : NUL-terminated nucleotide string; only its first three
 *              characters are examined. 'u' is treated as 't' and the
 *              comparison is case-insensitive.
 *   stop     : character produced for the stop codons tag/taa/tga.
 * Returns 'n' for an empty string. The residue is returned in upper case when
 * the first nucleotide of the codon is upper case, in lower case otherwise.
 * A trailing 'n' (any base) is accepted for the four-fold degenerate families
 * gcn, ggn, ctn, ccn, cgn, tcn, acn and gtn.
 * NOTE(review): the handling of strings shorter than three characters and of
 * unrecognised triplets, as well as the return statement, are not visible in
 * this excerpt.
 */
9720 int translate_dna_codon ( char *sequence, char stop)
9727 if ( strlen (sequence)<1) return 'n';
9728 upper=isupper (sequence[0])?1:0;
9730 if ( (b=strlen (sequence))<3)
9734 seq[0]=tolower(sequence[0]);
9735 seq[1]=tolower(sequence[1]);
9736 seq[2]=tolower(sequence[2]);
/* RNA input: read 'u' as 't' so a single DNA table is enough */
9739 seq[0]=(seq[0]=='u')?'t':seq[0];
9740 seq[1]=(seq[1]=='u')?'t':seq[1];
9741 seq[2]=(seq[2]=='u')?'t':seq[2];
9742 if ( strm5(seq, "gca", "gcg", "gcc", "gct","gcn"))ret='a';
9743 else if ( strm2(seq, "tgc","tgt"))ret='c';
9744 else if ( strm2(seq, "gac","gat"))ret='d';
9745 else if ( strm2(seq, "gaa","gag"))ret='e';
9746 else if ( strm2(seq, "ttc","ttt"))ret='f';
9747 else if ( strm5(seq, "gga","ggg","ggc", "ggt", "ggn"))ret='g';
9748 else if ( strm2(seq, "cac","cat"))ret='h';
9749 else if ( strm3(seq, "ata","atc","att"))ret='i';
9750 else if ( strm2(seq, "aaa","aag"))ret= 'k';
9751 else if ( strm6(seq, "cta","ctg","ctc", "ctt", "tta", "ttg"))ret='l';
9752 else if ( strm (seq, "ctn"))ret='l';
9753 else if ( strm (seq, "atg"))ret='m';
9754 else if ( strm2(seq, "aac","aat"))ret= 'n';
9755 else if ( strm5(seq, "cca","ccg","ccc", "cct","ccn"))ret='p';
9756 else if ( strm2(seq, "cag","caa"))ret='q';
9757 else if ( strm6(seq, "cga","cgg","cgc", "cgt","aga","agg"))ret='r';
9758 else if ( strm (seq, "cgn"))ret= 'r';
9759 else if ( strm6(seq, "tca","tcg","tcc", "tct","agc","agt"))ret='s';
/* degenerate serine family is tcn; "ccn" is proline and is matched above */
9760 else if ( strm (seq, "tcn"))ret='s';
9761 else if ( strm5(seq, "aca","acg","acc", "act", "acn"))ret='t';
9762 else if ( strm5(seq, "gta","gtg","gtc", "gtt", "gtn"))ret='v';
9763 else if ( strm (seq, "tgg"))ret='w';
9764 else if ( strm2(seq, "tac","tat"))ret='y';
9765 else if ( strm3(seq, "tag","taa","tga"))ret=stop;
/* restore the case of the first nucleotide on the residue */
9768 ret= (upper)?toupper(ret):ret;
/*
 * extend_seqaln: applies extend_seq, in place, to every string of a sequence
 * set. The strings come from S (S->seq, S->nseq) when S is non-NULL, otherwise
 * from the alignment A (A->seq_al, A->nseq).
 * NOTE(review): the branch taken when both S and A are NULL and the return
 * value are not visible in this excerpt.
 */
9773 int extend_seqaln (Sequence *S, Alignment *A)
9777 if (S){s=S->seq;n=S->nseq;}
9778 else if (A){s=A->seq_al;n=A->nseq;}
9781 for (a=0; a<n;a++){extend_seq(s[a]);}
/*
 * unextend_seqaln: applies unextend_seq, in place, to every string of a
 * sequence set. The strings come from S (S->seq, S->nseq) when S is non-NULL,
 * otherwise from the alignment A (A->seq_al, A->nseq).
 * NOTE(review): the branch taken when both S and A are NULL and the return
 * value are not visible in this excerpt.
 */
9784 int unextend_seqaln (Sequence *S, Alignment *A)
9788 if (S){s=S->seq;n=S->nseq;}
9789 else if (A){s=A->seq_al;n=A->nseq;}
9792 for (a=0; a<n;a++){unextend_seq(s[a]);}
/*
 * extend_seq: rewrites seq in place, replacing each nucleotide by a letter
 * that encodes the pair (r1,r2):
 *   a? -> d e f h   (r2 = a c g t)
 *   c? -> i k l m
 *   g? -> n p q r
 *   t? -> s v w y
 * 'u' is read as 't'. When r1 is 'x' or 'n' the code is 'x'; when only r2 is
 * 'n' or 'x' the code is r1 itself. The case of r1 is kept on the code, and
 * gap positions of seq are left untouched when the codes are written back.
 * unextend_seq performs the reverse mapping.
 * NOTE(review): the assignments of l, lb, r1 and r2, the release of buf/ebuf
 * and the return statement are not visible in this excerpt; presumably r1/r2
 * are consecutive residues of the ungapped copy -- confirm.
 */
9797 char *extend_seq (char *seq)
9800 int l, lb, a, b, upper,v;
9804 buf =vcalloc ( l+1, sizeof (char));
9805 ebuf=vcalloc ( l+1, sizeof (char));
9806 sprintf ( buf, "%s", seq);
9807 sprintf ( ebuf, "%s", seq);
/* the last position (lb-1) is not visited by this loop, so ebuf keeps the
   character copied from seq there */
9813 for (a=0; a<lb-1; a++)
9818 upper=(isupper(r1))?1:0;
9822 r1=(r1=='u')?'t':r1;
9823 r2=(r2=='u')?'t':r2;
9825 if (r1=='x' || r1=='n')v='x';
9826 else if (r2=='n' || r2=='x')v=r1;
9828 else if (r1=='a' && r2=='a')v='d';
9829 else if (r1=='a' && r2=='c')v='e';
9830 else if (r1=='a' && r2=='g')v='f';
9831 else if (r1=='a' && r2=='t')v='h';
9833 else if (r1=='c' && r2=='a')v='i';
9834 else if (r1=='c' && r2=='c')v='k';
9835 else if (r1=='c' && r2=='g')v='l';
9836 else if (r1=='c' && r2=='t')v='m';
9838 else if (r1=='g' && r2=='a')v='n';
9839 else if (r1=='g' && r2=='c')v='p';
9840 else if (r1=='g' && r2=='g')v='q';
9841 else if (r1=='g' && r2=='t')v='r';
9843 else if (r1=='t' && r2=='a')v='s';
9844 else if (r1=='t' && r2=='c')v='v';
9845 else if (r1=='t' && r2=='g')v='w';
9846 else if (r1=='t' && r2=='t')v='y';
9851 ebuf[a]=(upper)?toupper(v):v;
/* copy the codes back over the non-gap positions of the original string */
9854 for (b=0,a=0; a<l; a++)
9856 if ( !is_gap(seq[a]))seq[a]=ebuf[b++];
/*
 * unextend_seq: reverses the encoding produced by extend_seq, in place. Each
 * code letter is mapped back to the first nucleotide of the pair it stands for:
 *   d e f h -> a,   i k l m -> c,   n p q r -> g,   s v w y -> t.
 * The case of the code letter is kept, and gap positions of seq are left
 * untouched when the nucleotides are written back.
 * NOTE(review): the assignments of l, lb and r1, the first branch of the
 * if-chain, the release of buf/ebuf and the return statement are not visible
 * in this excerpt.
 */
9862 char *unextend_seq (char *seq)
9865 int l, lb, a, b, upper,v;
9869 buf =vcalloc ( l+1, sizeof (char));
9870 ebuf=vcalloc ( l+1, sizeof (char));
9871 sprintf ( buf, "%s", seq);
9872 sprintf ( ebuf, "%s", seq);
9878 for (a=0; a<lb-1; a++)
9881 upper=(isupper(r1))?1:0;
9883 r1=(r1=='u')?'t':r1;
9886 else if (r1=='d' || r1=='e' || r1 == 'f' || r1 == 'h')v='a';
9887 else if (r1=='i' || r1=='k' || r1 == 'l' || r1 == 'm')v='c';
9888 else if (r1=='n' || r1=='p' || r1 == 'q' || r1 == 'r')v='g';
9889 else if (r1=='s' || r1=='v' || r1 == 'w' || r1 == 'y')v='t';
9892 ebuf[a]=(upper)?toupper(v):v;
/* copy the decoded residues back over the non-gap positions of seq */
9895 for (b=0,a=0; a<l; a++)
9897 if ( !is_gap(seq[a]))seq[a]=ebuf[b++];
/*
 * mutate_aln: builds a randomly mutated copy B of the alignment A.
 *   r : mutation rate as text; an empty string selects 0.01. The rate is
 *       scaled by RAND_MAX and stored in the int ratio.
 * B is twice as wide as A: every column of A is copied (lower-cased) to an
 * even column of B and followed by a column filled with '~'. Each non-gap
 * residue in an even column is then mutated when a rand() draw does not exceed
 * ratio. Replacement letters are drawn from "AGCT" for DNA/RNA and from the 20
 * amino-acid letters for PROTEIN.
 * NOTE(review): the construction of S, the draw of type, the branch conditions
 * for the two non-deletion cases and the return statement are not visible in
 * this excerpt; the writes suggest an insertion (into the '~' slot a+1) and a
 * substitution (at a) -- confirm. alp is also not set in any visible line for
 * a sequence type other than DNA, RNA or PROTEIN.
 */
9906 Alignment * mutate_aln ( Alignment *A, char *r)
9908 int a, b, c, mut,type, ratio;
9916 if ( r[0]=='\0')ratio=0.01*RAND_MAX;
9917 else ratio=atof(r)*RAND_MAX;
9920 S=get_sequence_type(S);
9924 if ( strm(S->type, "DNA") || strm(S->type, "RNA"))sprintf (alp, "AGCT");
9925 else if ( strm(S->type, "PROTEIN"))sprintf (alp, "ACDEFGHIKLMNPQRSTVWY");
9927 alp_size=strlen(alp);
9929 B=copy_aln (A,NULL);
9930 B=realloc_aln(B, B->len_aln*2+1);
/* interleave: column a of A goes to column b=2a of B, column b+1 is a '~' slot */
9932 for ( a=0, b=0; a< A->len_aln; a++, b+=2)
9934 for ( c=0; c< A->nseq; c++)
9936 B->seq_al[c][b]=tolower(A->seq_al[c][a]);
9937 B->seq_al[c][b+1]='~';
9941 for ( c=0; c< A->nseq; c++)B->seq_al[c][b]='\0';
9942 B->len_aln=A->len_aln*2;
/* visit the original residues only (even columns) */
9947 for (a=0; a< B->len_aln; a+=2)
9948 for ( b=0; b<B->nseq; b++)
9950 if ( is_gap(B->seq_al[b][a]))continue;
9951 mut=((rand()%RAND_MAX)>ratio)?0:1;
9958 if (type==0)/*deletion*/
9960 B->seq_al[b][a]='.';
9964 B->seq_al[b][a+1]=alp[rand()%alp_size];
9968 B->seq_al[b][a]=alp[rand()%alp_size];
9976 free_sequence (S, S->nseq);
/*
 * mutate_amino_acid: returns a nucleotide triplet chosen at random for the
 * amino acid aa.
 *   mode : name of the amino-acid grouping handed to make_group_aa;
 *          "clustalw_col" is used when mode is NULL.
 * Tables built by the function:
 *   triplet         : the 64 codons over the alphabet "agct";
 *   cw_col          : the residue groups returned by make_group_aa;
 *   amino_acid_list : one row per amino acid; slot 0 holds the number of
 *                     codons recorded, slots 1.. hold codon indices into
 *                     triplet. A codon is recorded when its translation (a1)
 *                     and a2 belong to a common group of cw_col;
 *   lu              : letter -> index into amino_acid.
 * cw_col and amino_acid_list are built only while they are NULL.
 * The amino-acid/codon table is also printed on stderr.
 * NOTE(review): several lines are not visible in this excerpt, among them the
 * assignment of a2 (presumably amino_acid[a]), the guard around the triplet
 * construction and the declaration of lu (allocated on each visible path --
 * possible leak if it is not static).
 */
9982 char* mutate_amino_acid ( char aa, char *mode)
9986 char nucleotide[]="agct";
9987 char amino_acid[]="acdefghiklmnpqrstvwy";
9988 static char **triplet;
9989 static char **cw_col;
9991 static int **amino_acid_list;
9998 if ( !mode)sprintf (mat, "clustalw_col");
9999 else sprintf (mat, "%s", mode);
/* enumerate the 4*4*4 codons */
10002 triplet=declare_char ( 64, 4);
10003 for (d=0, a=0; a< 4;a++)
10004 for ( b=0; b< 4; b++)
10005 for ( c=0; c< 4; c++, d++)
10007 triplet[d][0]=nucleotide[a];
10008 triplet[d][1]=nucleotide[b];
10009 triplet[d][2]=nucleotide[c];
10012 if ( !cw_col)cw_col=make_group_aa ( &ng_cw_col,mat);
10013 if ( !amino_acid_list)
10015 amino_acid_list=declare_int ( 20, 65);
10016 for ( a=0; a< 20; a++)
10017 for ( b=0; b< 64; b++)
10019 a1=translate_dna_codon ( triplet[b], 'x');
10021 for ( d=0; d< ng_cw_col; d++)
10022 if ( is_in_set ( a1, cw_col[d]) && is_in_set ( a2, cw_col[d]))
/* slot 0 is the running count; the codon index is appended after it */
10024 amino_acid_list[a][++amino_acid_list[a][0]]=b;
10027 lu=vcalloc ( 26, sizeof (int));
10028 for ( a=0; a<20; a++)
10030 lu[amino_acid[a]-'a']=a;
10033 for ( a=0; a< 20; a++)
10035 fprintf ( stderr, "\n%c", amino_acid[a]);
10036 for ( b=1; b<=amino_acid_list[a][0]; b++)
10037 fprintf ( stderr, "\n\t%s %c", triplet[amino_acid_list[a][b]], translate_dna_codon (triplet[amino_acid_list[a][b]], 'x'));
/* NOTE(review): the random slot number (1..count) is used directly as an index
   into triplet, not looked up in amino_acid_list[lu[aa-'a']] -- so the result
   does not depend on the codons recorded for aa; looks unintended, confirm.
   aa is also assumed to be a lower-case letter (index aa-'a'). */
10042 return triplet [addrand((unsigned long)amino_acid_list[lu[aa-'a']][0])+1];
10045 /**************************************************************************************************/
10046 /******************************** ********************************************/
10047 /******************************** PROCESSING ********************************************/
10048 /******************************** ********************************************/
10052 void modify_data (Sequence_data_struc *D1in, Sequence_data_struc *D2in, Sequence_data_struc *DSTin, char **action_list,int n_actions, Action_data_struc *RAD)
10054 Sequence *COOR=NULL, *NS=NULL,*BUFS=NULL, *OUT_S=NULL;
10055 Constraint_list *CL;
10057 int value,upper_value, lower_value, start, end, a, b,c;
10058 int *count_table=NULL;
10060 Sequence_data_struc *D1;
10061 Sequence_data_struc *D2;
10062 Sequence_data_struc *DST;
10063 int s1, s2, r1, r2;
10064 static int clean_flag;
10069 action=action_list[0];
10071 if (action[0]=='2')
10079 else if ( action[0]=='1')
10086 else if ( action[0]=='3')
10099 if (!D1->A)D1->A=copy_aln (D1in->A, NULL);
10101 if ( strm(action, "seqnos"))
10103 (D1->A)->output_res_num=1;
10105 else if ( strm (action,"aln2bootstrap"))
10107 (D1->A)=aln2bootstrap (D1->A, ATOI_ACTION (1));
10108 D1->S=aln2seq (D1->A);
10110 else if ( strm (action,"aln2sample"))
10112 (D1->A)=aln2sample (D1->A, ATOI_ACTION (1));
10113 D1->S=aln2seq (D1->A);
10115 else if ( strm (action,"aln2random_aln"))
10117 (D1->A)=aln2random_aln (D1->A, ACTION (1));
10118 D1->S=aln2seq (D1->A);
10120 else if ( strm (action, "or_scan"))
10123 D1->A=or_scan(D1->A, D2->A, ACTION(1));
10124 D1->S=aln2seq (D1->A);
10126 else if ( strm (action, "or_sar"))
10128 D1->A=or_sar(D1->A, D2->A, ACTION(1), PRINT);
10129 D1->S=aln2seq (D1->A);
10131 else if ( strm ( action, "sar2subsar"))
10136 Alignment *subA, *subS;
10140 fprintf ( stderr, "\nin=aln, in2=sar sar2subsar [filter value compound1 compound2...] | [jack1] | [file]\n");
10141 myexit (EXIT_FAILURE);
10144 sarset2subsarset ( D1->A, D2->A, &subA, &subS, main_read_aln (action_list[2], NULL));
10145 D1->A=subA;D2->A=subS;
10147 else if ( strm (action, "display_sar"))
10149 D1->A=display_sar (D1->A, D2->A, action_list[1]);
10151 else if ( strm ( action, "sar2simpred"))
10156 sar2simpred ( D1->A, D2->A, action_list[1], action_list[2], atoi(action_list[3]), atoi (action_list[4]));
10158 else if ( strm ( action, "sar2simpred2"))
10165 fprintf ( stderr, "\nERROR: +sar2simpred2 seqnamesfile posfile compound limit");
10166 myexit (EXIT_FAILURE);
10168 sar2simpred2 ( D1->A, D2->A, action_list[1], action_list[2], action_list[3], atoi (action_list[4]));
10170 else if ( strm ( action, "sar_analyze"))
10175 sar_analyze ( D1->A, D2->A,action_list[1]);
10177 else if ( strm ( action, "simple_sar_predict"))
10179 //displays each column with ist score;
10180 simple_sar_predict (D1->A, D2->A,ACTION(1));
10181 myexit (EXIT_SUCCESS);
10183 else if ( strm ( action, "display_sar_analyze"))
10185 //displays each column with ist score;
10186 display_simple_sar_analyze_col (D1->A, D2->A,ACTION(1));
10187 myexit (EXIT_SUCCESS);
10189 else if ( strm ( action, "display_sar_analyze_pc"))
10191 //displays each column with ist score;
10192 display_simple_sar_analyze_pair_col (D1->A, D2->A,ACTION(1));
10193 myexit (EXIT_SUCCESS);
10195 else if ( strm ( action, "weight2sar"))
10202 fprintf ( stderr, "\nERROR: +weight2sar <weight_file> <limit>");
10203 myexit (EXIT_FAILURE);
10205 D1->A=weight2sar ( D1->A,D2->A, action_list[1], atoi(action_list[2]));
10208 else if ( strm ( action, "sar_weight"))
10215 fprintf ( stderr, "\nERROR: +sar_weight <sar_analyze> <compound>");
10216 myexit (EXIT_FAILURE);
10218 D1->A=aln2weighted_sar_score ( D1->A,D2->A, action_list[1], action_list[2]);
10219 D1->S=aln2seq ( D1->A);
10222 else if ( strm (action, "name2unique_name"))
10225 char command[1000];
10226 tmp1=vtmpnam (NULL); tmp2=vtmpnam (NULL);
10228 output_fasta_aln (tmp1,D1->A);
10229 free_aln (D1->A);free_sequence (D1->S, -1);
10230 sprintf ( command, "fasta_aln2fasta_aln_unique_name.pl %s >%s", tmp1, tmp2);
10231 my_system ( command);
10232 D1->S=get_fasta_sequence ( tmp2, NULL);
10233 D1->A=seq2aln (D1->S,NULL, 1);
10235 else if ( strm (action, "rm_tag") || strm (action, "rm_template"))
10238 char **temp_name=NULL,**temp_list=NULL, temp_nseq=0;
10241 if ( D1 && D1->A){temp_name=(D1->A)->name;temp_nseq=(D1->A)->nseq;}
10242 else if ( D1 && D1->S){temp_name=(D1->S)->name;temp_nseq=(D1->S)->nseq;}
10243 temp_list=rm_name_tag (temp_name,temp_nseq, NULL);
10244 if ( n_actions>1 && strm (action_list[1], "template"))
10247 for ( z=0; z<temp_nseq; z++)
10249 if (temp_list[z][0])
10250 {fprintf (stdout, "%s\n", temp_list[z]);}
10252 myexit (EXIT_SUCCESS);
10255 else if (strm (action, "add_template") || strm (action, "swap_header"))
10257 D1->S=seq2template_seq (D1->S, action_list[1], NULL);
10258 D1->A=seq2aln(D1->S, NULL, 1);
10260 else if ( strm ( action, "seq2year"))
10262 D1->S=seq2year (D1->S, (n_actions>1)?atoi(action_list[1]):1);
10263 D1->A=seq2aln(D1->S, NULL, 1);
10265 else if ( strm (action, "swap_lib_header"))
10268 S=main_read_seq (action_list[1]);
10272 else if ( strm (action, "weight_lib"))
10276 w=atoi (action_list[1]);
10280 Sequence *S=(D1->CL)->S;
10281 int ***r=(D1->CL)->residue_index;
10283 for (s1=0; s1<S->nseq; s1++)
10284 for (r1=1; r1<=S->len[s1]; r1++)
10285 for (b=1; b<r[s1][r1][0]; b+=3)
10291 else if ( strm (action, "struc2nb"))
10294 for ( c=0; c< (D1->S)->nseq; c++)
10296 struclist2nb ((D1->S)->name[c],(D1->S)->seq[c], (D1->S)->seq_comment[c], atof(action_list[1]),ACTION(2),ACTION(3) );
10298 myexit (EXIT_SUCCESS);
10303 else if ( strm(action, "seq2contacts"))
10306 D1->S=swap_header (D1->S, D2->S);
10307 for ( z=0; z< (D1->S)->nseq; z++)sprintf ( (D1->A)->name[z], "%s", (D1->S)->name[z]);
10308 DST->S=seq2contacts (D1->S, atof (action_list[1]));
10309 DST->A=copy_aln (D1->A, NULL);
10310 thread_seq_struc2aln ( DST->A,DST->S);
10311 for (z=0; z< (D1->S)->nseq; z++)
10315 else if ( strm(action, "struc2contacts"))
10318 if ( atof (action_list[3])>0)
10320 seq=map_contacts (action_list[1], action_list[2], atof (action_list[3]));
10321 fprintf ( stderr, "\n>%s %s\n%s",action_list[1], action_list[2],seq);
10324 print_contacts (action_list[1], action_list[2], atof (action_list[3]));
10326 myexit (EXIT_SUCCESS);
10328 else if ( strm(action, "treelist_prune")|| strm(action, "prune_treelist"))
10331 if (D2 && D2->S)TS=D2->S;
10332 else TS=treelist2sub_seq((D1->S),ATOI_ACTION(1));
10333 treelist2prune_treelist ( D1->S,TS, NULL);
10334 D1->A=seq2aln (D1->S, NULL, NO_PAD);
10336 else if ( strm (action, "tree2unresolved_nodes"))
10340 ns=tree2nseq (D1->T);
10341 l=vcalloc (ns, sizeof (int));
10342 tree2nnode_unresolved (D1->T, l);
10343 for ( a=0; a<ns; a++)if (l[a])fprintf ( stdout, "SIZE: %d COUNT: %d\n", a, l[a]);
10345 myexit (EXIT_SUCCESS);
10347 else if ( strm(action, "tree_prune") || strm(action, "prune_tree"))
10349 D1->T=main_prune_tree ( D1->T, D2->S);
10351 else if ( strm ( action, "tree2seq"))
10353 D1->S=tree2seq(D1->T, NULL);
10354 D1->A=seq2aln (D1->S, D1->A, 1);
10355 (D1->A)->len_aln=1;
10356 for ( a=0; a< (D1->A)->nseq; a++)sprintf ( (D1->A)->seq_al[a], "sequence");
10358 else if ( strm (action, "seq2dpatree"))
10360 D1->T= seq2dpa_tree(D1->S,"ktup");
10362 else if ( strm (action, "tree2dpatree"))
10364 D1->T= tree2dpa_tree(D1->T,(D2 && D2->A)?D2->A:D1->A, (n_actions==1)?"idmat":action_list[1]);
10366 else if ( strm (action, "tree2group"))
10368 vfclose (tree2group (D1->T, (tree2seq(D1->T,NULL)), atoi(action_list[1]), atoi(action_list[2]),(n_actions==4)?action_list[3]:NULL, stdout));
10369 myexit (EXIT_SUCCESS);
10371 else if ( strm(action, "unroot"))
10373 D1->T=unroot_tree(D1->T);
10377 else if ( strm(action, "treelist2group")|| strm(action, "treelist2groups") )
10381 if (D2 && D2->S)TS=D2->S;
10382 else TS=treelist2seq((D1->S));
10383 treelist2groups (D1->S, TS, ACTION(1), stdout);
10384 myexit (EXIT_SUCCESS);
10386 // treelist2groups (D1->S,(D2)?D2->S:NULL, ACTION(1), stdout );
10387 //exit (EXIT_SUCCESS);
10389 else if ( strm(action, "splits2tree"))
10392 D1->T=split2tree ((D2)?D2->T:NULL,D1->S, ACTION(1));
10395 else if ( strm(action, "count_splits"))
10398 count_splits ((D2)?D2->T:NULL,D1->S, ACTION(1));
10399 myexit (EXIT_SUCCESS);
10401 else if ( strm(action, "count_groups"))
10403 count_tree_groups (D1->S, ACTION(1));
10405 else if ( strm (action, "tree2dist"))
10410 TS=(D2)?D2->S:NULL;
10411 td=tree2dist (D1->T,TS, NULL);
10412 if (!TS)TS=tree2seq(D1->T, NULL);
10413 for (ta=0; ta<TS->nseq; ta++)
10415 fprintf ( stdout, "%-15s ",TS->name[ta]);
10416 for ( tb=0; tb<TS->nseq; tb++)
10419 if ( ACTION(1) && strm (ACTION(1), "length"))n=1;
10421 fprintf (stdout, " %4d", td [n][ta][tb]);
10423 fprintf ( stdout, "\n");
10425 myexit (EXIT_SUCCESS);
10427 else if ( strm (action, "treelist2lti"))
10430 if (D2 && D2->S)TS=D2->S;
10431 else TS=treelist2sub_seq((D1->S),ATOI_ACTION(2));
10432 treelist2lti (D1->S,TS, (int)ATOI_ACTION(1), stdout );
10435 else if ( strm (action,"treelist2frame"))
10438 if (D2 && D2->S)TS=D2->S;
10439 else TS=treelist2sub_seq((D1->S),ATOI_ACTION(1));
10440 treelist2frame (D1->S, TS);
10441 myexit (EXIT_SUCCESS);
10444 else if ( strm (action, "treelist2seq"))
10446 D1->S=treelist2sub_seq (D1->S,ATOI_ACTION(1));
10447 D1->A=seq2aln(D1->S, NULL, 1);
10449 else if ( strm (action, "treelist2leafgroup"))
10451 treelist2leafgroup (D1->S, (D2)?D2->S:NULL, ACTION(1));
10454 else if ( strm(action, "treelist2splits"))
10456 if (D1->T)D1->S=add_file2file_list ((D1->T)->file, NULL);
10457 treelist2splits (D1->S, (D2)?D2->S:NULL);
10460 else if ( strm(action, "treelist2dmat"))
10462 treelist2dmat (D1->S);
10464 else if ( strm(action, "tree2node") )
10466 print_node_list ( D1->T,(DST)?DST->S:NULL);
10467 myexit (EXIT_SUCCESS);
10469 else if ( strm(action, "tree_cmp_list") )
10471 D1->T=main_compare_trees_list ( D1->T, D2->S, stdout);
10473 else if ( strm(action, "tree_cmp") || strm (action, "tree_compare"))
10475 D1->T=main_compare_trees ( D1->T, D2->T, stdout);
10477 else if ( strm (action, "tree_scan"))
10479 D1->T=tree_scan (D1->A, D2->T, ACTION(1), ACTION(2));
10481 else if ( strm (action, "split_cmp"))
10483 main_compare_splits (D1->T, D2->T, ACTION(1), stdout);
10486 else if ( strm(action, "node_sort"))
10488 node_sort ( action_list[1], D1->T);
10489 myexit (EXIT_SUCCESS);
10492 else if ( strm ( action, "avg_bootstrap"))
10494 display_avg_bootstrap (D1->T);
10495 myexit (EXIT_SUCCESS);
10497 else if ( strm (action, "genepred2acc"))
10501 vfree (display_accuracy (genepred_seq2accuracy_counts (D2->S, D1->S, NULL),stderr));
10502 myexit (EXIT_SUCCESS);
10504 else if ( strm (action, "tree_cog_cmp"))
10506 main_compare_cog_tree (D1->T,action_list[1]);
10507 myexit (EXIT_SUCCESS);
10509 else if ( strm (action, "tree_aln_cmp"))
10511 main_compare_aln_tree (D1->T, D2->A, stdout);
10512 myexit (EXIT_SUCCESS);
10514 else if ( strm(action, "change_bootstrap"))
10516 D1->T=reset_boot_tree ( D1->T, (n_actions>=2)?atoi(action_list[1]):0);
10518 else if ( strm(action, "change_distances"))
10520 D1->T=reset_dist_tree ( D1->T, (n_actions>=2)?atof(action_list[1]):0.00);
10523 else if ( strm(action, "aln2tree"))
10525 D1->T=tree_compute (D1->A, n_actions-1, action_list+1);
10527 else if ( strm(action, "similarities2tree"))
10529 D1->T=similarities_file2tree (ACTION(1));
10532 else if ( strm(action, "original_seqnos"))
10534 (D1->A)->output_res_num=2;
10537 else if ( strm (action, "aln2pred"))
10539 aln2pred (D1->A, D2->A, ACTION (1));
10540 myexit (EXIT_SUCCESS);
10542 else if ( strm(action, "evaluate"))
10547 DST->A=copy_aln (D1->A, NULL);
10548 DST->S=aln2seq(DST->A);
10549 if (n_actions>1 && strm ( action_list[1], "categories"))
10551 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt"));
10552 DST->A= main_coffee_evaluate_output(DST->A, CL, "categories");
10554 else if (n_actions>1 && strm ( action_list[1], "sar"))
10556 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt"));
10557 DST->A= main_coffee_evaluate_output(DST->A, CL, "sar");
10559 else if (n_actions>1 && strstr ( action_list[1], "boxshade"))
10561 char color_mode[1000];
10562 sprintf (color_mode,"boxshade_%d", atoi(ACTION2(2,"30")));
10563 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice("pam250mt"));
10564 DST->A= main_coffee_evaluate_output(DST->A, CL, color_mode);
10568 CL=declare_constraint_list ( DST->S,NULL, NULL, 0,NULL, read_matrice((n_actions==1)?"pam250mt":action_list[1]));
10569 DST->A= main_coffee_evaluate_output(DST->A, CL, "matrix");
10572 DST->S=aln2seq ( DST->A);
10576 sprintf ( A->name[A->nseq], "cons");
10577 sprintf ( A->seq_al[A->nseq], "%s", aln2cons_seq_mat (A, "idmat"));
10580 else if ( strm (action, "sp_evaluate"))
10582 fprintf ( stdout, "SP Score: %.2f", sum_pair ((DST && DST->A)?DST->A:D1->A,ACTION(1),atoi(ACTION2(2,"0")),atoi(ACTION2(3,"0"))));
10583 myexit (EXIT_SUCCESS);
10585 else if ( strm (action, "lat_evaluate"))
10588 score=lat_sum_pair ( D1->A, action_list[1]);
10589 fprintf ( stdout, "\nLAT_SCORE: %.2f", score);
10590 myexit (EXIT_SUCCESS);
10593 else if ( strm (action, "add_scale"))
10595 D1->A=aln2scale (D1->A, ACTION(1));
10597 else if ( strm (action, "RNAfold_cmp"))
10599 D1->A=compare_RNA_fold (D1->A, D2->A);
10601 else if ( strm (action, "aln2alifold"))
10603 D1->A=aln2alifold (D1->A);
10604 D1->S=aln2seq ( D1->A);
10608 else if ( strm (action, "add_alifold"))
10610 D1->A=add_alifold2aln (D1->A, (D2)?D2->A:NULL);
10613 else if ( strm (action, "alifold2analyze"))
10615 D1->A=alifold2analyze (D1->A, (D2)?D2->A:NULL, ACTION(1));
10616 D1->S=aln2seq(D1->A);
10618 else if ( strm (action, "aln2conservation"))
10620 D1->A=aln2conservation ( D1->A, ATOI_ACTION (1), ACTION (2));
10621 myexit (EXIT_FAILURE);
10623 else if ( strm (action, "aln2cons"))
10627 cons_name=vcalloc (100, sizeof (char));
10628 sprintf(cons_name, "%s", (n_actions<=2)?"Cons":action_list[2]);
10629 cons_seq=aln2cons_seq_mat (D1->A, (n_actions==1)?"blosum62mt":action_list[1]);
10630 free_aln (D1->A);free_sequence(D1->S, -1);
10631 D1->S=fill_sequence_struc (1, &cons_seq, &cons_name);
10633 (D1->S)->len[0]=strlen (cons_seq); sprintf ( (D1->S)->seq[0], "%s", cons_seq);
10634 D1->A=seq2aln (D1->S, NULL, KEEP_GAP);
10635 vfree (cons_name);vfree (cons_seq);
10637 else if ( strm (action, "seq2filter"))
10639 D1->S=seq2filter ( D1->S, atoi(action_list[1]), atoi(action_list[2]));
10642 else if ( strm (action, "aln2resindex"))
10644 //-in: aln, file: ref_seq ref_res target_seq
10645 //-in2 target sequences
10646 aln2resindex (D1->A, (D2)?D2->A:NULL, stdout);
10647 myexit (EXIT_SUCCESS);
10649 else if (strm(action, "keep_name"))
10651 RAD->keep_name=1-RAD->keep_name;
10653 else if (strm(action, "use_consensus") ||strm(action, "use_cons") )
10655 RAD->use_consensus=1-RAD->use_consensus;
10657 else if ( strm(action, "ungap"))
10659 seq2aln (D1->S, D1->A, 1);
10661 else if ( strm2(action, "rmgap", "rm_gap"))
10664 ungap_aln_n (D1->A, (n_actions==1)?100:atoi(action_list[1]));
10665 //free_sequence ( D1->S, (D1->S)->nseq);
10666 D1->S=aln2seq ( D1->A);
10669 else if ( strm(action, "rmgap_col"))
10671 D1->A=remove_gap_column ( D1->A,action_list[1]);
10673 else if ( strm(action,"random"))
10676 D1->A= make_random_aln(NULL,(n_actions==1)?1:atoi(action_list[1]),(n_actions==2)?100:atoi(action_list[2]),"acdefghiklmnpqrstvwy");
10678 D1->S=aln2seq ( D1->A);
10681 else if ( strm(action, "landscape"))
10684 set_landscape_msa ((n_actions==1)?0:atoi(action_list[1]));
10686 else if ( strm(action, "clean_maln"))
10690 fprintf ( stderr,"\n[You Need an evaluation File: Change the output format][FATAL:%s]\n", PROGRAM);
10691 myexit(EXIT_FAILURE);
10693 (DST->A)=aln2number (DST->A);
10694 D1->A=clean_maln(D1->A, DST->A,(n_actions==1)?1:atoi(action_list[1]),(n_actions==1)?1:atoi(action_list[2]));
10696 else if ( strm (action, "extract"))
10699 COOR=get_pir_sequence (RAD->coor_file, NULL);
10700 D1->S=extract_sub_seq ( COOR, D1->S);
10702 D1->A=declare_Alignment(D1->S);
10703 seq2aln (D1->S, D1->A, RAD->rm_gap);
10704 free_sequence (COOR, COOR->nseq);
10706 else if ( strm (action, "reorder_column"))
10711 Alignment *RO1, *RO2;
10715 RO1=rotate_aln (D1->A,NULL);
10716 if (ACTION(1) && strm (ACTION(1), "tree"))
10718 D1->T=tree_compute (RO1,n_actions-2, action_list+2);
10719 OUT_S=tree2seq(D1->T, NULL);
10720 RO1=reorder_aln(RO1, OUT_S->name, OUT_S->nseq);
10722 else if ( ACTION(1) && strm (ACTION(1), "random"))
10724 RO1=reorder_aln ( RO1, NULL, RO1->nseq);
10727 RO2=rotate_aln (RO1, NULL);
10728 for (s=0; s< RO2->nseq; s++)
10729 sprintf ( RO2->name[s], "%s", (D1->A)->name[s]);
10733 D1->S=aln2seq(D1->A);
10736 else if ( strm (action, "reorder"))
10739 if ( n_actions==2 && strm (action_list[1], "random"))
10741 D1->A=reorder_aln ( D1->A, NULL, (D1->A)->nseq);
10743 else if (n_actions==2 && strm (action_list[1], "invert"))
10748 nname=declare_char ((D1->A)->nseq, 100);
10749 for ( z=0,y=(D1->A)->nseq-1; z<(D1->A)->nseq; z++, y--)
10751 sprintf (nname[z], "%s",(D1->A)->name[y]);
10754 D1->A=reorder_aln ( D1->A, nname, (D1->A)->nseq);
10755 free_char (nname, -1);
10757 else if (n_actions==2 && strm (action_list[1], "scramble"))
10759 D1->A=aln2scramble_seq(D1->A);
10762 else if ( n_actions==2 && strm (action_list[1], "tree"))
10765 OUT_S=tree2seq (D2->T, NULL);
10766 D1->A=reorder_aln(D1->A, OUT_S->name, OUT_S->nseq);
10767 free_sequence (D1->S,(D1->S)->nseq);
10768 D1->S=aln2seq (D1->A);
10772 (D2->A)->S=aln2seq (D2->A);
10773 (D1->A)->S=aln2seq (D1->A);
10774 OUT_S=trim_aln_seq_name(D2->A, D1->A);
10775 D1->A=reorder_aln(D1->A, OUT_S->name, OUT_S->nseq);
10776 free_sequence (D1->S,(D1->S)->nseq);
10777 D1->S=aln2seq (D1->A);
10780 else if ( strm (action, "aln2replicate"))
10782 aln2N_replicate (D1->A, ACTION(1), ACTION(2));
10784 else if ( strm (action, "paralogous_cat"))
10786 D1->A=orthologous_concatenate_aln (D1->A,D2->S, ACTION (1));
10789 else if ( strm (action, "cat_aln"))
10791 /*D1->A=aln_cat ( D1->A, D2 ->A);*/
10793 if (D2 && D2->A && !ACTION(1))
10794 D1->A=concatenate_aln (D1->A, D2->A, ACTION(1));
10795 else if (ACTION(1) && is_aln(ACTION(1)))
10803 B=main_read_aln (ACTION(n), NULL);
10804 D1->A=concatenate_aln (D1->A, B, NULL);
10807 D1->S=aln2seq(D1->A);
10814 A=main_read_aln ((D1->A)->name[0], NULL);
10816 for ( a=1; a<(D1->A)->nseq; a++)
10818 B=main_read_aln ((D1->A)->name[a], NULL);
10819 A=concatenate_aln (A, B, ACTION(1));
10823 D1->S=aln2seq(D1->A);
10827 else if ( strm ( action, "msalist2cat_pwaln"))
10839 min=atoi(action_list[1]);
10840 max=atoi(action_list[2]);
10843 fprintf ( stdout, ">A\n");
10844 for (a=0;a<(D1->S)->nseq; a++)
10847 HERE ("process %s", (D1->S)->name[a]);
10848 A=main_read_aln((D1->S)->name[a],NULL);
10849 for (b=0; b<A->nseq-1; b++)
10851 for ( c=b+1; c<A->nseq; c++)
10853 sim=get_seq_sim (A->seq_al[b], A->seq_al[c], "-", "");
10854 if (sim>=min && sim<=max)fprintf (stdout, "xxx%s", A->seq_al[b]);
10859 fprintf ( stdout, "\n>B\n");
10860 for (a=0;a<(D1->S)->nseq; a++)
10863 HERE ("process %s", (D1->S)->name[a]);
10864 A=main_read_aln((D1->S)->name[a],NULL);
10865 for (b=0; b<A->nseq-1; b++)
10867 for ( c=b+1; c<A->nseq; c++)
10869 sim=get_seq_sim (A->seq_al[b], A->seq_al[c], "-", "");
10870 if (sim>=min && sim<=max)fprintf (stdout, "xxx%s", A->seq_al[c]);
10876 fprintf ( stdout, "\n");
10877 myexit (EXIT_SUCCESS);
10880 else if ( strm (action, "collapse_tree"))
10882 D1->T=tree2collapsed_tree (D1->T, n_actions-1, action_list+1);
10884 else if ( strm (action, "collapse_aln"))
10886 D1->A=aln2collapsed_aln (D1->A, n_actions-1, action_list+1);
10888 else if ( strm (action, "extract_aln"))
10890 D1->A=aln2sub_aln_file (D1->A, n_actions-1, action_list+1);
10891 myexit (EXIT_SUCCESS);
10896 else if ( strm (action, "remove_aa"))
10899 pos=atoi(action_list[1]);
10900 len=atoi(action_list[2]);
10901 n=atoi (action_list[3]);
10902 if ( atoi (action_list[4])==1)len=-len;
10905 fprintf ( stderr, "\nWARNING: rm_aa, position (pos) and iteration number (n) simulatneously defined. Iteration number reset to 1 [%s]\n", PROGRAM);
10908 for ( a=0; a< n; a++)
10909 D1->A=probabilistic_rm_aa (D1->A, pos, len);
10911 else if ( strm (action, "remove_nuc"))
10914 pos=atoi(action_list[1]);
10916 if ( pos>3 || pos<1)
10917 printf_exit (EXIT_FAILURE, stderr, "Remove_nuc: indicate a number between 1 and 3\n");
10920 for ( c=0,a=0; a<(D1->A)->len_aln; a++, c++)
10923 for (b=0; b<(D1->A)->nseq; b++)
10927 (D1->A)->seq_al[b][a]='-';
10932 D1->S=aln2seq (D1->A);
10935 else if (strm ( action, "conserved_positions"))
10943 for ( a=0; a< A->nseq && !cache; a++)
10945 if ( strm (action_list[1], A->name[a]))
10947 cache=vcalloc ( A->len_aln+1, sizeof (int));
10948 for ( c=0,b=0; b<A->len_aln; b++)
10950 if ( is_gap (A->seq_al[a][b]))cache[b]=-1;
10956 for ( a=0; a< A->len_aln; a++)
10958 r1=A->seq_al[0][a];
10959 if ( is_gap(r1))continue;
10960 for ( c=0,b=0; b<A->nseq; b++)
10962 r2=A->seq_al[b][a];
10965 if ( (c*100)/A->nseq>=atoi(action_list[2]))
10966 fprintf ( stdout, "COL: %d Res: %c %s %d\n", a+1, r1, action_list[1], cache[a]+atoi(action_list[3]));
10968 myexit (EXIT_FAILURE);
10970 else if (strm ( action, "extract_block") )
10973 BUF=copy_aln (D1->A, NULL);
10974 if ( check_file_exists(action_list[1]))
10975 BUF=extract_aln3(BUF,action_list[1]);
10977 BUF=extract_aln2(BUF,atoi(action_list[2]),atoi(action_list[3]),action_list[1]);
10978 D1->A=copy_aln (BUF,D1->A);
10981 else if ( strm ( action, "extract_pos_list"))
10983 D1->A=alnpos_list2block (D1->A, n_actions-1, action_list+1);
10985 else if ( strm ( action, "seq2msa"))
10987 D1->A=simple_progressive_aln ( D1->S, NULL, NULL, action_list[1]);
10989 else if ( strm ( action, "realign_block") )
10991 D1->A=realign_block ( D1->A, atoi (action_list[1]), atoi (action_list[2]), (n_actions==4)?action_list[3]:NULL);
10993 else if ( strm (action, "extract_seq"))
10996 if ( check_file_exists (action_list[1])&& format_is_fasta (action_list[1]))
10999 BUFS=main_read_seq (action_list[1]);
11000 action_list=BUFS->name;
11001 n_actions=BUFS->nseq;
11010 for ( a=0; a< n_actions;)
11014 if ( n_actions==1 || is_file==1)
11023 start=(strm2 (s,"#","*"))?1:(atoi(action_list[a+1]));
11024 end= (strm2 (action_list[a+2],"#","*"))?0:(atoi(action_list[a+2]));
11028 if ( strm2 (s, "#", "*"))
11030 OUT_S=extract_one_seq((D1->A)->name[0],start, end, D1->A, RAD->keep_name);
11031 for (b=1; b< (D1->A)->nseq; b++)
11033 NS=extract_one_seq((D1->A)->name[b],start, end, D1->A, RAD->keep_name);
11034 if (count_n_res_in_array(NS->seq[0], -1))
11035 OUT_S=add_sequence ( NS,OUT_S, 0);
11040 if ( a==1)OUT_S=extract_one_seq(s,start, end, D1->A, RAD->keep_name);
11043 NS=extract_one_seq(s,start, end, D1->A, RAD->keep_name);
11044 OUT_S=add_sequence ( NS,OUT_S, 0);
11050 D1->A=declare_Alignment(D1->S);
11051 seq2aln (D1->S, D1->A, RAD->rm_gap);
11054 else if ( strm (action, "extract_seq_list"))
11056 if ( check_file_exists (action_list[1]) && format_is_fasta (action_list[1]))
11059 BUFS=main_read_seq (action_list[1]);
11060 action_list=BUFS->name;
11061 n_actions=BUFS->nseq;
11069 for ( a=0; a< n_actions;a++)
11072 NS=extract_one_seq(action_list[a],1,0, D1->A, KEEP_NAME);
11073 OUT_S=add_sequence ( NS,OUT_S, 0);
11079 D1->A=declare_Alignment(D1->S);
11080 seq2aln (D1->S, D1->A, RAD->rm_gap);
11082 else if ( strm (action, "remove_seq") || strm (action, "rm_seq"))
11089 list=declare_char ((D1->S)->nseq, 200);
11091 buf=vcalloc ((D1->S)->max_len+1, sizeof (char));
11092 for ( n=0,a=0; a< (D1->A)->nseq; a++)
11095 sprintf (buf, "%s", (D1->S)->seq[a]);
11099 for (c=1, b=1; b< n_actions; b++)
11101 if ( strm (action_list[b], (D1->S)->name[a])){(D1->S)->seq[a]=NULL;break;}
11102 else if ( strm (action_list[b], "empty") && l==0)
11104 fprintf ( stderr, "WARNING: Sequence %s does not contain any residue: automatically removed from the set [WARNING:%s]\n",(D1->S)->name[a], PROGRAM);
11105 (D1->S)->seq[a]=NULL;break;
11107 else if ( strm (action_list[b], "unique"))
11109 if ( name_is_in_list ((D1->S)->name[a], list,n, 100)!=-1)
11111 (D1->S)->seq[a]=NULL;break;
11115 sprintf ( list[n++], "%s", (D1->S)->name[a]);
11120 D1->S=duplicate_sequence (D1->S);
11122 free_char ( list, -1);
11123 D1->A=declare_Alignment(D1->S);
11124 seq2aln (D1->S, D1->A, RAD->rm_gap);
11127 else if ( strm (action, "aln2overaln")|| strm (action,"overaln_param"))
11129 //mode (lower|number|unalign) Penalty (0-100) Threshold (0-9)
11133 char clean_mode[100];
11136 F=vcalloc (1, sizeof (OveralnP));
11139 D1->A=mark_exon_boundaries (D1->A, D2->A);
11142 else if ( (s=get_string_variable ("exon_boundaries")))
11146 EB=seq2aln(S=main_read_seq(s),NULL, 0);
11147 D1->A=mark_exon_boundaries (D1->A, EB);
11148 free_sequence (S, S->nseq); free_aln (EB);
11153 if (ACTION(1)==NULL)sprintf (F->mode, "lower");
11154 else if (strstr (ACTION(1), "h"))
11156 fprintf ( stdout, "aln2unalign lower|number|unalign|uanlign2 F P1 P2 P3 T\n");
11157 myexit (EXIT_SUCCESS);
11159 else sprintf (F->mode, "%s", ACTION(1));
11161 F->t=ATOI_ACTION(2);
11162 F->f=ATOI_ACTION(3);
11163 F->p1=ATOI_ACTION(4);
11164 F->p2=ATOI_ACTION(5);
11165 F->p3=ATOI_ACTION(6);
11166 F->p3=ATOI_ACTION(7);
11168 if (int_variable_isset ("overaln_target"))f=get_int_variable ("overaln_target");
11169 if (int_variable_isset ("overaln_threshold"))t=get_int_variable ("overaln_threshold");
11170 if (eb)sprintf (F->model, "fsa2");
11171 else sprintf (F->model, "fsa1");
11173 D1->A=aln2clean_pw_aln (D1->A, F);
11176 else if ( strm (action, "unalign_groups"))
11178 //unalign everything in lower case
11179 unalign_aln_2 (D1->A, NULL, 0);
11181 else if ( strm (action,"aln2unalign"))
11185 SA=copy_aln (D1->A, NULL);
11188 thread_seq_struc2aln (SA, SS);
11189 D1->A=unalign_aln (D1->A,SA, ATOI_ACTION(1));
11190 D1->S=aln2seq ( D1->A);
11192 else if ( strm (action, "clean_cdna"))
11196 for (a=0; a< A->nseq; a++)
11201 f=get_longest_frame (d, 3);
11202 buf=vcalloc ( strlen (d)+1, sizeof (char));
11203 sprintf (buf, "%s", d+f);
11204 sprintf (d, "%s", buf);
11208 else if ( strm (action, "clean_cdna2"))
11210 D1->A=clean_cdna_aln ( D1->A);
11211 free_sequence ( D1->S, (D1->S)->nseq);
11212 D1->S=aln2seq ( D1->A);
11214 else if ( strm (action, "aln2short_aln"))
11216 D1->A=aln2short_aln (D1->A, action_list[1], action_list[2], atoi(action_list[3]));
11217 free_sequence ( D1->S, (D1->S)->nseq);
11218 D1->S=aln2seq ( D1->A);
11220 else if ( strm ( action, "complement"))
11222 D1->A=complement_aln (D1->A);
11223 free_sequence ( D1->S, (D1->S)->nseq);
11224 D1->S=aln2seq ( D1->A);
11226 else if ( strm ( action, "extend"))
11228 extend_seqaln( NULL,D1->A);
11229 free_sequence ( D1->S, (D1->S)->nseq);
11230 D1->S=aln2seq ( D1->A);
11232 else if ( strm ( action, "unextend"))
11234 unextend_seqaln( NULL,D1->A);
11235 free_sequence ( D1->S, (D1->S)->nseq);
11236 D1->S=aln2seq ( D1->A);
11238 else if ( strm ( action, "translate"))
11240 D1->A=translate_dna_aln( D1->A,(n_actions==1)?0:atoi(action_list[1]));
11241 free_sequence ( D1->S, (D1->S)->nseq);
11242 D1->S=aln2seq ( D1->A);
11244 else if (strm2 ( action, "back_translate","backtranslate"))
11246 D1->A=back_translate_dna_aln( D1->A);
11247 free_sequence ( D1->S, (D1->S)->nseq);
11248 D1->S=aln2seq ( D1->A);
11250 else if (strm ( action, "rotate"))
11252 D1->A=rotate_aln( D1->A, action_list[1]);
11253 free_sequence ( D1->S, (D1->S)->nseq);
11254 D1->S=aln2seq ( D1->A);
11256 else if (strm ( action, "invert"))
11258 D1->A=invert_aln( D1->A);
11259 free_sequence ( D1->S, (D1->S)->nseq);
11260 D1->S=aln2seq ( D1->A);
11262 else if (strm ( action, "test_dna2gene"))
11264 testdna2gene ((D1->S)->seq[0]);
11266 else if (strm ( action, "code_dna_aln"))
11268 D1->A=code_dna_aln( D1->A);
11269 free_sequence ( D1->S, (D1->S)->nseq);
11270 D1->S=aln2seq ( D1->A);
11273 else if ( strm ( action, "mutate"))
11275 D1->A=mutate_aln( D1->A,(n_actions==1)?"0":action_list[1]);
11276 free_sequence ( D1->S, (D1->S)->nseq);
11277 D1->S=aln2seq (D1->A);
11279 else if ( strm ( action, "thread_profile_on_msa"))
11282 D1->A=thread_profile_files2aln (D1->A, action_list[1], NULL);
11283 D1->S=aln2seq(D1->A);
11285 else if ( strm ( action, "thread_dna_on_prot_aln"))
11287 D1->A=thread_dnaseq_on_prot_aln (D1->S, D2->A);
11288 free_sequence (D1->S,(D1->S)->nseq);
11289 D1->S=aln2seq (D1->A);
11291 else if ( strm ( action, "thread_struc_on_aln"))
11293 thread_seq_struc2aln ( D2->A, D1->S);
11294 D1->A=copy_aln(D2->A, D1->A);
11295 free_sequence ( D1->S, (D1->S)->nseq);
11296 D1->S=aln2seq (D1->A);
11298 else if ( strm (action, "sim_filter"))
11300 D1->A=sim_filter (D1->A, action_list[1], ACTION (2));
11301 free_sequence (D1->S,(D1->S)->nseq);
11302 D1->S=aln2seq (D1->A);
11304 else if ( strm (action, "seq2blast"))
11306 D1->A=seq2blast (D1->S);
11307 free_sequence (D1->S,(D1->S)->nseq);
11308 D1->S=aln2seq (D1->A);
11311 else if ( strm (action, "trim"))
11313 D1->A=simple_trimseq (D1->A,(D2)?D2->A:NULL, action_list[1], ACTION (2), NULL);
11315 free_sequence (D1->S,(D1->S)->nseq);
11316 D1->S=aln2seq (D1->A);
11319 else if (strm ( action, "trimTC"))
11321 value=(n_actions==1)?10:atoi(action_list[1]);
11323 D1->A=tc_trimseq(D1->A,D1->S,action_list[1]);
11324 free_sequence (D1->S,(D1->S)->nseq);
11325 D1->S=aln2seq (D1->A);
11327 else if (strm ( action, "trimTC2"))
11331 char trim_mode[100];
11332 if ( n_actions==1 || !(strm (action_list[1], "NSEQ") ||strm (action_list[1], "MINID")) )
11334 fprintf ( stderr, "\nTrimTC2 <NSEQ | MINID> <number sequences| minimum identity> (<matrix>)\n");
11335 myexit (EXIT_FAILURE);
11337 sprintf (trim_mode, "%s", action_list[1]);action_list+=2; n_actions-=2;
11339 if ( strm ( trim_mode, "NSEQ"))
11341 group_file=tree2Ngroup( (D1)?D1->A:NULL, (D2)?D2->T:NULL, atoi (action_list[0]), vtmpnam(NULL), (n_actions==1)?"idmat":action_list[1]);
11345 group_file=tree2Ngroup( (D1)?D1->A:NULL, (D2)?D2->T:NULL, -1*atoi (action_list[0]), vtmpnam(NULL), (n_actions==1)?"idmat":action_list[1]);
11348 B=copy_aln (D1->A, B);
11349 B=aln2sub_aln_file (B,1,&group_file);
11350 B=aln2sub_seq (B, 1, &group_file);
11351 D1->A=extract_sub_aln2 (D1->A, B->nseq, B->name);
11353 else if ( strm (action, "chain"))
11355 D1->A=seq2seq_chain (D1->A,D2->A, ACTION(2));
11359 else if (strm ( action, "master_trim"))
11361 value=(n_actions==1)?10:atoi(action_list[1]);
11363 D1->A=master_trimseq(D1->A,D1->S,action_list[1]);
11364 free_sequence (D1->S,(D1->S)->nseq);
11365 D1->S=aln2seq (D1->A);
11367 else if ( strm (action, "force_aln"))
11369 char ***rlist=NULL;
11374 if (!is_lib_02(action_list[1]))
11376 fprintf ( stderr, "\nERROR: force_aln requires files in TC_LIB_FORMAT_02 [FATAL:%s]", PROGRAM);
11377 myexit (EXIT_FAILURE);
11380 rlist=file2list (action_list[1], " ");
11384 rlist=declare_arrayN(3, sizeof (char),3,7, 10);
11386 strcat (rlist[1][1],action_list[1]);strcat (rlist[1][3],action_list[2]);
11387 strcat (rlist[1][4],action_list[3]);strcat (rlist[1][6],action_list[4]);
11388 sprintf ( rlist[2][0], "-1");
11391 while (rlist[count] && atoi(rlist[count][0])!=-1)
11393 char st1[100], st2[100], st3[100], st4[100];
11395 sprintf ( st1, "%s", rlist[count][1]);sprintf ( st2, "%s", rlist[count][3]);
11396 sprintf ( st3, "%s", rlist[count][4]);sprintf ( st4, "%s", rlist[count][6]);
11397 fprintf ( stderr, "\nFORCE: %s %s %s %s", st1, st2, st3, st4);
11399 if (is_number (st1))s1=atoi (st1)-1;
11400 else s1=name_is_in_list (st1,(D1->A)->name, (D1->A)->nseq, 100);
11401 if ( s1<0 || s1>= (D1->A)->nseq)crash ("wrong sequence index");
11404 if (is_number (st3))s2=atoi (st3)-1;
11405 else s2=name_is_in_list (st3,(D1->A)->name, (D1->A)->nseq, 100);
11406 if ( s2<0 || s2>= (D1->A)->nseq)crash ("wrong sequence index");
11409 (D1->A)=add_constraint2aln ((D1->A), s1, r1, s2, r2);
11412 fprintf ( stderr, "\n");
11413 free_arrayN((void*)rlist,3);
11416 else if (strm ( action, "grep"))
11418 D1->A=grep_seq (D1->A, ACTION(1),ACTION(2), ACTION(3));
11419 if (D1->A==NULL) myexit (EXIT_SUCCESS);
11420 else D1->S=aln2seq (D1->A);
11423 else if (strm (action, "find"))
11426 char *search_string;
11428 search_string=vcalloc ( 30, sizeof (char));
11429 if ( strm (action_list[1], "lower"))sprintf ( search_string, "abcdefghijklmnopqrstuvwxyz");
11430 else if ( strm ( action_list[1], "upper"))sprintf ( search_string, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
11433 vfree (search_string);search_string=vcalloc ( strlen (action_list[1])+1, sizeof (char));
11434 sprintf (search_string, "%s", action_list[1]);
11437 for (a=0; a<(D1->A)->nseq; a++)
11438 for ( l=0,b=0; b< (D1->A)->len_aln; b++)
11440 r=(D1->A)->seq_al[a][b];
11442 if ( r!='\0' && strrchr (search_string, r))
11444 /*fprintf ( stdout, "%-15s res %c alnpos %4d seqpos %4d\n", (D1->A)->name[a], r, b+1, l);*/
11445 fprintf ( stdout, "%s %d %d\n", (D1->A)->name[a], l, l+1);
11448 myexit (EXIT_SUCCESS);
11450 else if ( strm (action, "merge_annotation"))
11452 D1->A=merge_annotation (D1->A, DST?DST->A:NULL, ACTION(1));
11453 D1->S=aln2seq (D1->A);
11455 else if ( strm (action, "color_residue"))
11461 DST->A=copy_aln (D1->A, NULL);
11462 DST->S=aln2seq (DST->A);
11463 for (a=0; a< (DST->S)->nseq; a++)ungap ((DST->S)->seq[a]);
11467 for (a=1; a<n_actions; a+=3)
11469 i=name_is_in_list(action_list[a], (D1->A)->name, (D1->A)->nseq, 100);
11472 (DST->S)->seq[i][atoi(action_list[a+1])-1]='0'+atoi(action_list[a+2])-1;
11474 else fprintf (stderr, "\nWARNING: Could not find Sequence %s", action_list[a]);
11483 fp=vfopen (action_list[1], "r");
11484 while (fscanf (fp, "%s %d %d\n", name, &pos, &val)==3)
11487 i=name_is_in_list(name, (D1->A)->name, (D1->A)->nseq, 100);
11488 if (i!=-1)(DST->S)->seq[i][pos-1]='0'+val;
11489 else fprintf (stderr, "\nWARNING: Could not find Sequence %s", action_list[a]);
11493 DST->A=seq2aln (DST->S, NULL, 1);
11495 else if ( strm (action, "edit_residue"))
11500 char mod[100], name[100];
11508 for (a=1; a<n_actions; a+=3)
11511 i=name_is_in_list(action_list[a], (D1->A)->name, (D1->A)->nseq, 100);
11514 pos=atoi(action_list[a+1]);
11517 sprintf (mod, "%s", action_list[a+2]);
11518 if ( strm (mod, "upper"))(D1->A)->seq_al[i][pos]=toupper((D1->A)->seq_al[i][pos]);
11519 else if ( strm (mod, "lower"))(D1->A)->seq_al[i][pos]=tolower((D1->A)->seq_al[i][pos]);
11520 else (D1->A)->seq_al[i][pos]=mod[0];
11522 else fprintf (stderr, "\nWARNING: Could not find Sequence %s", action_list[a]);
11528 fp=vfopen (action_list[1], "r");
11529 while (fscanf (fp, "%s %d %s\n", name, &pos, mod)==3)
11532 i=name_is_in_list(name, (D1->A)->name, (D1->A)->nseq, 100);
11536 if ( strm (mod, "upper"))(D1->A)->seq_al[i][pos]=toupper(A->seq_al[i][pos]);
11537 else if ( strm (mod, "lower"))A->seq_al[i][pos]=tolower(A->seq_al[i][pos]);
11538 else A->seq_al[i][pos]=mod[0];
11540 else fprintf(stderr, "\nWARNING: Could not find Sequence %s", action_list[1]);
11544 D1->S=aln2seq (D1->A);
11546 else if ( strm (action, "clean_flag"))
11548 clean_flag=1-clean_flag;
11550 else if ( strm (action, "aln2case"))
11552 D1->A=aln2case_aln (D1->A, ACTION(1), ACTION(2));
11553 D1->S=aln2seq(D1->A);
11556 else if ( strm5 (action, "convert","upper","lower", "keep", "switchcase"))
11560 if ( n_actions>1 && is_number (action_list[b]))
11562 lower_value=upper_value=atoi(action_list[b++]);
11564 else if ( n_actions>1 && strm (action_list[b], "gap"))
11566 DST=vcalloc (1,sizeof(Sequence_data_struc));
11567 DST->A=aln2gap_cache (D1->A,0);
11572 else if (n_actions>1 && action_list[b] && action_list[b][0]=='[')
11575 lower_value=atoi(strtok (action_list[b]+1, "-[]"));
11576 upper_value=atoi(strtok (NULL, "-[]"));
11582 lower_value=upper_value=-1;
11585 if ( n_actions >b ||strm (action, "keep") )
11587 if ( !RAD->symbol_list)RAD->symbol_list=declare_char (STRING, STRING);
11589 if ( strm (action, "keep") )sprintf ( RAD->symbol_list[RAD->n_symbol++], "#-");
11592 for (a=b; a< n_actions; a++)
11594 sprintf ( RAD->symbol_list[RAD->n_symbol], "%s", action_list[a]);
11600 for ( value=0; value<=9; value++)
11602 if ( lower_value==-1)value=-1;
11604 if ( (value>=lower_value && value<=upper_value)|| value==-1)
11606 if (strm(action,"convert")) D1->A=filter_aln_convert (D1->A, DST?DST->A:NULL,RAD->use_consensus,value,RAD->n_symbol, RAD->symbol_list);
11607 else if (strm(action,"upper"))D1->A=filter_aln_lower_upper (D1->A, DST?DST->A:NULL,RAD->use_consensus,value);
11608 else if (strm(action,"lower"))D1->A=filter_aln_upper_lower (D1->A, DST?DST->A:NULL,RAD->use_consensus,value);
11609 else if (strm(action,"switchcase"))D1->A=filter_aln_switchcase (D1->A, DST?DST->A:NULL,RAD->use_consensus,value);
11613 if (strm(action,"keep")) D1->A=filter_aln_convert (D1->A, DST?DST->A:NULL,RAD->use_consensus,value,RAD->n_symbol, RAD->symbol_list);
11615 if (value==-1)break;
11619 /*free_sequence (D1->S,(D1->S)->nseq);*/
11620 if (!D1->S)D1->S=aln2seq (D1->A);
11622 else if ( strm ( action, "count_pairs"))
11624 int a, b,c,v, **matrix;
11626 matrix=declare_int (300,300);
11628 for ( a=0; a< A->nseq-1; a++)
11629 for (b=0; b< A->nseq; b++)
11630 for (c=0; c<A->len_aln; c++)
11631 matrix[(int)A->seq_al[a][c]][(int)A->seq_al[b][c]]++;
11632 for ( a=0; a<255; a++)
11633 for ( b=a; b<256; b++)
11635 v=matrix[a][b]+matrix[b][a];
11636 if (v)fprintf ( stdout, "\n%c %c %d", a, b, v);
11638 myexit (EXIT_SUCCESS);
11640 else if ( strm (action, "count_misc"))
11642 count_misc (D1->A, (!D2)?NULL:D2->A);
11644 else if ( strm (action, "count"))
11647 if ( n_actions>1 && is_number (action_list[b]))
11649 lower_value=upper_value=atoi(action_list[b++]);
11651 else if (n_actions>1 && action_list[b] && action_list[b] && action_list[b][0]=='[')
11654 lower_value=atoi(strtok (action_list[b]+1, "-[]"));
11655 upper_value=atoi(strtok (NULL, "-[]"));
11661 lower_value=upper_value=-1;
11665 if ( !RAD->symbol_list)RAD->symbol_list=declare_char (STRING, STRING);
11667 for (a=b; a< n_actions; a++)
11669 sprintf ( RAD->symbol_list[RAD->n_symbol], "%s", action_list[a]);
11673 for ( value=lower_value; value<=upper_value; value++)
11675 count_table=count_in_aln (D1->A, DST?DST->A:NULL,value,RAD->n_symbol, RAD->symbol_list, count_table);
11677 for ( a=0; a<RAD->n_symbol; a++)
11679 fprintf ( stdout, "%s %d\n", RAD->symbol_list[a], count_table[a]);
11681 free_sequence (D1->S,(D1->S)->nseq);
11682 D1->S=aln2seq (D1->A);
11683 vfree(count_table);
11684 myexit(EXIT_SUCCESS);
11686 else if ( strm (action, "species_weight"))
11688 seq_weight2species_weight (D1->A, D2->S);
11691 else if ( strm (action, "aln2voronoi"))
11693 aln2voronoi_weights (D1->A);
11696 else if ( strm (action, "msa_weight"))
11699 char command [LONG_STRING];
11700 char aln_name[FILENAMELEN];
11701 char tree_name[FILENAMELEN];
11702 char dist_matrix_name[FILENAMELEN];
11703 char weight_name[FILENAMELEN];
11704 char method_4_msa_weights[1000];
11708 fprintf ( stderr, "\nError: msa_weight requires a weight_method");
11711 sprintf ( method_4_msa_weights, "%s", (get_env_variable ("METHOD_4_MSA_WEIGHTS",NO_REPORT))?get_env_variable ("METHOD_4_MSA_WEIGHTS",NO_REPORT):METHOD_4_MSA_WEIGHTS);
11713 /*1 Computation of the tree and the distance matrix*/
11714 random_value=addrand ((unsigned long) 100000)+1;
11715 sprintf (aln_name, "%d.aln", random_value);
11716 sprintf (tree_name, "%d.ph", random_value);
11717 sprintf (dist_matrix_name, "%d.dst", random_value);
11718 sprintf (weight_name, "%d.weight", random_value);
11719 output_fasta_aln (aln_name, D1->A);
11721 sprintf ( command, "clustalw -infile=%s -tree -outputtree=dist %s", aln_name, TO_NULL_DEVICE);
11722 my_system ( command);
11723 sprintf ( command, "%s -method %s -aln %s -tree %s -dmatrix %s -weightfile %s %s",method_4_msa_weights, action_list[1],aln_name, tree_name, dist_matrix_name,weight_name, TO_NULL_DEVICE);
11724 my_system ( command);
11726 (D1->A)->S=aln2seq (D1->A);
11727 ((D1->A)->S)->W=read_seq_weight ( (D1->A)->name, (D1->A)->nseq,weight_name);
11728 vremove (weight_name);
11729 vremove (aln_name);
11730 vremove (tree_name);
11731 vremove (dist_matrix_name);
11733 else if ( strm (action, "pavie_seq2random_seq"))
11735 D1->S=pavie_seq2random_seq (D1->S, action_list[1]);
11736 D1->A=seq2aln (D1->S,NULL,1);
11738 else if ( strm ( action, "pavie_seq2noisy_seq"))
11740 /*<amount of noise: 0-100> (<alp>)*/
11742 D1->S=pavie_seq2noisy_seq (D1->S, atoi(action_list[1]),ACTION(2));
11743 D1->A=seq2aln (D1->S,NULL,1);
11745 else if ( strm (action, "pavie_seq2pavie_mat"))
11748 pavie_seq2trained_pavie_mat ( D1->S, (n_actions==2)?action_list[1]:NULL);
11749 myexit (EXIT_SUCCESS);
11751 else if ( strm (action, "pavie_seq2pavie_aln"))
11754 pavie_seq2pavie_aln ( D1->S, action_list[1], ACTION(2));
11755 myexit (EXIT_SUCCESS);
11757 else if ( strm (action, "pavie_seq2pavie_dm"))
11759 if (strstr (ACTION2(2,""), "_MSA_"))
11760 D1->S=aln2seq_main(D1->A, KEEP_GAP);
11763 pavie_seq2pavie_aln ( D1->S, action_list[1],(n_actions==3)?action_list[2]:"_MATDIST_");
11764 myexit (EXIT_SUCCESS);
11766 else if ( strm (action, "pavie_seq2pavie_msa"))
11768 D1->A=pavie_seq2pavie_msa ( D1->S, action_list[1], (n_actions==3)?action_list[2]:NULL);
11770 else if ( strm (action, "pavie_seq2pavie_tree"))
11772 D1->T=pavie_seq2pavie_tree ( D1->S, action_list[1], (n_actions==3)?action_list[2]:NULL);
11774 else if ( strm (action, "pavie_seq2pavie_sort"))
11776 D1->A=pavie_seq2pavie_sort ( D1->S, action_list[1], (n_actions==3)?action_list[2]:NULL);
11779 else if ( strm (action, "aln2mat_diaa"))
11781 aln2mat_diaa (D1->S);
11783 else if ( strm (action, "aln2mat"))
11788 else if ( strm (action, "seq2latmat"))
11790 seq2latmat ( D1->S, "stdout");
11791 myexit (EXIT_SUCCESS);
11793 else if ( strm (action , "rm_target_pdb"))
11798 for (i=0; i< (D1->A)->nseq; i++)
11800 j=1;buf=(D1->A)->name[i];
11801 while (buf[j]!='_' && buf[j-1]!='_' && buf[j]!='\0')j++;
11805 else if ( strm ( action, "mat2cmp"))
11808 r=mat2cmp (D1->M, D2->M);
11809 fprintf ( stdout, "\nMATRIX COMPARISON: R=%.3f R2=%.3f On %d pairs of values\n", (float)r[0], (float)r[1], (int)r[2]);
11810 myexit (EXIT_SUCCESS);
11813 else if ( strm ( action, "overaln_list"))
11815 float *re, tre=0,sn, tsn=0, sp, tsp=0;
11816 int p1,p2,p3, t, f;
11822 HERE ("F P1 P2 P3 T");
11832 LA=vcalloc ((D1->A)->nseq, sizeof (Alignment*));
11833 LB=vcalloc ((D2->A)->nseq, sizeof (Alignment*));
11834 for (a=0; a<(D1->A)->nseq; a++)
11836 LA[a]=main_read_aln ((D1->A)->name[a], NULL);
11837 LB[a]=main_read_aln ((D2->A)->name[a], NULL);
11840 for ( a=0; a<(D1->A)->nseq; a++)
11845 re=analyze_overaln (A, B, "_case_l_",t,f,p1,p2,p3);
11846 fprintf (stdout, "\n%d: sn: %.2f sp: %.2f re: %.2f F: %d P: %d P2: %d T: %d",a, re[0],re[1],re[2],f, p1,p2,t);
11852 fprintf (stdout, "\nTOT: sn: %.2f sp: %.2f re: %.2f F: %d P: %d P2: %d T: %d", tsn/(D1->A)->nseq,tsp/(D1->A)->nseq, tre/(D1->A)->nseq,f,p1,p2,t);
11856 else if ( strm ( action, "overaln_list_scan"))
11858 float *re, tre=0, tsn=0, tsp;
11859 int p1,p2, p3, t, f;
11865 if ( ACTION(1))sprintf ( fname, "%s", ACTION(1));
11866 else sprintf ( fname, "scan_results.txt");
11868 fprintf ( stdout, "SCAN Results will be ouput in %s\n", fname);
11871 LA=vcalloc ((D1->A)->nseq, sizeof (Alignment*));
11872 LB=vcalloc ((D2->A)->nseq, sizeof (Alignment*));
11873 for (a=0; a<(D1->A)->nseq; a++)
11875 LA[a]=main_read_aln ((D1->A)->name[a], NULL);
11876 LB[a]=main_read_aln ((D2->A)->name[a], NULL);
11878 for (f=32; f<=40; f++)
11880 for (p1=90; p1<=100; p1+=5)
11882 for ( t=1; t<=3; t++)
11884 for (p2=0; p2<=40; p2+=5)
11886 for (p3=0;p3<=0;p3+=5)
11889 for ( a=0; a<(D1->A)->nseq; a++)
11894 re=analyze_overaln (A, B, "_case_l_",t,f,p1,p2,p3);
11901 fp=vfopen (fname, "a");
11902 fprintf (fp, "\nTOT: sn: %.2f sp: %.2f re: %.2f P: %d P2: %d P3: %d T: %d F: %d", tsn/(D1->A)->nseq,tsp/(D1->A)->nseq, tre/(D1->A)->nseq, p1,p2, p3,t,f);
11903 fprintf (stderr, "\nTOT: sn: %.2f sp: %.2f re: %.2f P: %d P2: %d P3: %d T: %d F: %d", tsn/(D1->A)->nseq,tsp/(D1->A)->nseq, tre/(D1->A)->nseq, p1,p2, p3,t,f);
11912 else if ( strm ( action, "overaln"))//Evaluate the capacity to predict over-aligned regions
11915 F=vcalloc (1, sizeof (OveralnP));
11918 //ATOI(1): P (0-100)
11922 DST=vcalloc (1,sizeof(Sequence_data_struc));
11923 DST->A=aln2gap_cache (D1->A,0);
11926 D1->A=filter_aln_upper_lower (D1->A, DST->A, 0, 0);
11928 sprintf (F->mode, "%s", ((s=get_string_variable ("overaln_mode")))?s:"lower");
11929 if (!strm (F->mode, "lower") && !strstr (F->mode, "unalign"))printf_exit (EXIT_FAILURE,stderr,"\nERROR: unknown overal_mode in overal output [%s] [FATAL:%s]", F->mode, PROGRAM);
11931 if (int_variable_isset ("overaln_threshold"))F->t=get_int_variable ("overaln_threshold");
11932 if (int_variable_isset ("overaln_target"))F->f=get_int_variable ("overaln_target");
11933 if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P1");
11934 if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P2");
11935 if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P3");
11936 if (int_variable_isset ("overaln_P1"))F->f=get_int_variable ("overaln_P4");//F P1 P2 P3 T;
11938 D2->A=aln2clean_pw_aln (D2->A, F);
11939 r=aln2pred (D1->A, D2->A,"case_l_");
11940 fprintf ( stdout, "sn %.2f sp %.2f re %.2f\n", r[0], r[1], r[2]);
11946 else if ( strm ( action, "aln2hitMat"))
11948 aln2hitMat(D1->A, ACTION(1));
11949 myexit (EXIT_SUCCESS);
11955 fprintf ( stderr, "\nWARNING: ACTION %s UNKNOWN and IGNORED\n", action);
/* aln2mat_diaa: builds and prints a di-residue ("diaa") log-odds table.
 *
 * Each name held in S is handed to main_read_aln(); for every pair of rows
 * (s1 < s2) and every pair of adjacent columns (p, p+1) the four residues are
 * looked at, and quadruplets are tallied in the 26x26x26x26 table m.
 * The result is written to stdout under a "# DIAA_MATRIX_FORMAT_01" header and
 * the function finishes by calling myexit (EXIT_SUCCESS).
 *
 * S: sequence set whose name[] entries are passed to main_read_aln().
 *
 * NOTE(review): this listing does not show every line of the function (the
 * embedded line numbers skip), so declarations, braces, the test applied to u
 * and the updates of c/count/naa are not visible here. The comments below
 * describe only the statements that are shown.
 */
11961 void aln2mat_diaa (Sequence *S)
11963 int a, aa1, aa2, aa3, aa4;
/* Threshold under which a frequency is treated as zero in the log-odds step. */
11971 double Delta=0.00001;
11974 double observed, expected, f_diaa1, f_diaa2, v;
/* alp[ch] is 1 for the 26 characters 'a'..'z' and 0 for everything else. */
11977 alp=vcalloc (256, sizeof (int));
11978 for (a=0; a<26; a++)alp[a+'a']=1;
/* m: quadruplet table indexed by four 0..25 residue codes.
 * c: 26x26 table read further down as a di-residue count (filled in lines not shown). */
11986 m=declare_arrayN (4,sizeof (int),26,26,26,26);
11987 c=declare_arrayN (2,sizeof (int),26,26);
/* One alignment per entry of S; its name is echoed on stderr as progress output. */
11989 for ( a=0; a< S->nseq; a++)
11991 fprintf ( stderr, "%s\n", S->name[a]);
11992 A=main_read_aln (S->name[a],NULL);
/* lower_string is applied to every row; presumably it lower-cases the row,
 * which the 'a'-based indexing below relies on - confirm. */
11993 for (s1=0; s1<A->nseq; s1++)lower_string (A->seq_al[s1]);
/* All unordered row pairs, and all column pairs (p, p+1). */
11995 for ( s1=0; s1<A->nseq-1; s1++)
11996 for (s2=s1+1; s2<A->nseq; s2++)
11998 for (p=0; p<A->len_aln-1; p++)
/* u accumulates the alp[] flags of the four residues, so it equals 4 only when
 * all four are in 'a'..'z'. The test on u itself is in a line not shown. */
12001 u =alp[aa1=A->seq_al[s1][p]];
12002 u+=alp[aa2=A->seq_al[s1][p+1]];
12003 u+=alp[aa3=A->seq_al[s2][p]];
12004 u+=alp[aa4=A->seq_al[s2][p+1]];
/* Turn the characters into 0..25 indices before using them as subscripts. */
12008 aa1-='a';aa2-='a';aa3-='a'; aa4-='a';
12012 m[aa1][aa2][aa3][aa4]++;
12019 fprintf ( stdout, "# DIAA_MATRIX_FORMAT_01\n");
/* Enumerate every quadruplet of residue indices below naa (naa is assigned in
 * a line not shown). */
12021 for (aa1=0; aa1<naa; aa1++)
12022 for (aa2=0; aa2<naa; aa2++)
12023 for (aa3=0; aa3<naa; aa3++)
12024 for (aa4=0; aa4<naa;aa4++)
/* tot adds both orientations of the pair of di-residues. */
12033 tot=m[aa1][aa2][aa3][aa4]+m[aa3][aa4][aa1][aa2];
/* observed = tot / (count/2); f_diaa1 and f_diaa2 are c[][] divided by count. */
12034 observed=((double)tot)/(double)((double)count/(double)2);
12035 f_diaa1=(double)c[aa1][aa2]/(double)count;
12036 f_diaa2=(double)c[aa3][aa4]/(double)count;
12038 expected=f_diaa1*f_diaa2;
/* Score: 0 when expected is below Delta, -100 when observed is below Delta,
 * otherwise 10 * natural log of observed/expected. */
12039 if (expected<Delta)v=0;
12040 else if (observed<Delta)v=-100;
12043 v=log(observed/expected)*10;
12045 // if (tot>0)fprintf ( stdout, "TEST C=%d expected=%.4f observed=%.4f v=%.4f [%d %d %d][%d] tot=%d\n", count, (float)expected, (float)observed, (float) v, c[aa1][aa2], c[aa3][aa4], count, m[aa1][aa2][aa3][aa4], tot);
/* Output line: the two di-residues as letters, the score truncated to int,
 * and the two-orientation count. */
12046 fprintf ( stdout, "%c%c %c%c %d %d\n", aa1+'a', aa2+'a', aa3+'a', aa4+'a', (int)v, m[aa1][aa2][aa3][aa4]+ m[aa3][aa4][aa1][aa2]);
12049 myexit (EXIT_SUCCESS);
/* aln2mat: builds a single-residue log-odds matrix and prints it.
 *
 * Each name held in S is handed to main_read_aln(); row pairs (s1 < s2) are
 * scanned column by column. Scores are computed from the 26x26 table m and the
 * 26-entry table c, stored in the 256x256 matrix mat, and mat is passed to
 * output_blast_mat (mat, "stdout"). The function finishes by calling
 * myexit (EXIT_SUCCESS).
 *
 * S: sequence set whose name[] entries are passed to main_read_aln().
 *
 * NOTE(review): this listing does not show every line of the function (the
 * embedded line numbers skip); the statements that update m, c, count and naa
 * are not visible here. Comments describe only what is shown.
 */
12051 void aln2mat (Sequence *S)
/* Threshold under which a frequency is treated as zero in the log-odds step. */
12061 double Delta=0.00001;
12064 double observed, expected, f_diaa1, f_diaa2, v;
/* balp[ch] holds the position of ch within BLAST_AA_ALPHABET; its later use is
 * not visible in this listing. */
12067 balp=vcalloc ( 256, sizeof (char));
12068 for (a=0; a<strlen (BLAST_AA_ALPHABET); a++)balp[BLAST_AA_ALPHABET[a]]=a;
12070 mat=declare_int (256, 256);
/* alp[ch] is 1 for the 26 characters 'a'..'z' and 0 for everything else. */
12071 alp=vcalloc (256, sizeof (int));
12072 for (a=0; a<26; a++)alp[a+'a']=1;
/* m: 26x26 pair table; c: 26-entry table read below as a per-residue count. */
12080 m=declare_arrayN (2,sizeof (int),26,26);
12081 c=declare_arrayN (1,sizeof (int),26);
/* One alignment per entry of S; its name is echoed on stderr as progress output. */
12083 for ( a=0; a< S->nseq; a++)
12085 fprintf ( stderr, "%s\n", S->name[a]);
12086 A=main_read_aln (S->name[a],NULL);
/* lower_string is applied to every row; presumably it lower-cases the row - confirm. */
12087 for (s1=0; s1<A->nseq; s1++)lower_string (A->seq_al[s1]);
12089 for ( s1=0; s1<A->nseq-1; s1++)
12090 for (s2=s1+1; s2<A->nseq; s2++)
/* NOTE(review): the bound is len_aln-1, so this loop never visits the last
 * column although only column p is read below - confirm this is intended. */
12092 for (p=0; p<A->len_aln-1; p++)
/* u accumulates the alp[] flags of the two residues (2 when both are in
 * 'a'..'z'); the test on u is in a line not shown. */
12095 u =alp[aa1=A->seq_al[s1][p]];
12096 u+=alp[aa3=A->seq_al[s2][p]];
12111 fprintf ( stdout, "# MONOAA_MATRIX_FORMAT_01\n");
/* Enumerate every pair of residue indices below naa (naa is assigned in a line
 * not shown). */
12113 for (aa1=0; aa1<naa; aa1++)
12114 for (aa3=0; aa3<naa; aa3++)
/* tot adds both orientations of the pair. */
12121 tot=m[aa1][aa3]+m[aa3][aa1];
/* observed = tot / (count/2); f_diaa1 and f_diaa2 are c[] divided by count. */
12122 observed=((double)tot)/(double)((double)count/(double)2);
12123 f_diaa1=(double)c[aa1]/(double)count;
12124 f_diaa2=(double)c[aa3]/(double)count;
12126 expected=f_diaa1*f_diaa2;
/* Score: 0 when expected is below Delta, -100 when observed is below Delta,
 * otherwise log(observed/expected) divided by log(2)/2, i.e. twice the base-2
 * logarithm of the ratio. */
12127 if (expected<Delta)v=0;
12128 else if (observed<Delta)v=-100;
12131 v=log(observed/expected)/(log(2)/2);
12133 // if (tot>0)fprintf ( stdout, "TEST C=%d expected=%.4f observed=%.4f v=%.4f [%d %d %d][%d] tot=%d\n", count, (float)expected, (float)observed, (float) v, c[aa1][aa2], c[aa3][aa4], count, m[aa1][aa2][aa3][aa4], tot);
12134 //fprintf ( stdout, "%c %c %d %d\n", aa1+'A', aa3+'A', (int)v, m[aa1][aa3]+ m[aa3][aa1]);
/* The score is truncated to int when stored. */
12135 mat[aa1][aa3]=(int)v;
12138 output_blast_mat (mat, "stdout");
12139 myexit (EXIT_SUCCESS);
/* seq2latmat: derives a residue-to-next-residue score table from the sequences
 * of S and writes it to fname with a "# BLAST_MATRIX FORMAT" header.
 *
 * S:     sequences scanned position by position (S->seq[a], S->len[a]).
 * fname: name handed to vfopen() in "w" mode.
 * The return type is int **; the return statement itself is not visible in
 * this listing.
 *
 * NOTE(review): this listing does not show every line of the function (the
 * embedded line numbers skip); the statements that update count, mat and tot
 * inside the scanning loop are not visible. Comments describe only what is
 * shown.
 */
12143 int **seq2latmat ( Sequence *S, char *fname)
12150 double observed, expected;
12153 fp=vfopen (fname, "w");
/* count and mat are indexed by character code (0..255). */
12155 count=vcalloc ( 256, sizeof (int));
12156 mat=declare_int (256, 256);
/* aa is a private copy of BLAST_AA_ALPHABET; naa is its length. */
12158 naa=strlen ( BLAST_AA_ALPHABET);
12159 aa=vcalloc ( naa+2, sizeof (char));
12160 sprintf ( aa, "%s", BLAST_AA_ALPHABET);
/* Walk every sequence; r0/r1 are the lower-cased residues at b-1 and b. */
12163 for ( tot=0,a=0; a< S->nseq; a++)
12166 for ( b=1; b<S->len[a]; b++)
12168 r0=tolower(S->seq[a][b-1]);
12169 r1=tolower(S->seq[a][b]);
/* Convert the raw entries of mat into scores for every alphabet pair. */
12177 for ( a=0; a< naa; a++)
12178 for (b=0; b< naa; b++)
/* NOTE(review): this if ends with an empty statement (';'); whatever follows it
 * in the lines not shown decides whether '*' entries are really skipped. */
12180 if ( aa[a]=='*' || aa[b]=='*');
/* expected = (count[x]/tot) * (count[y]/tot) * tot; observed = mat[x][y]. */
12183 expected=((double)count[(int)aa[a]]/(double)tot)* ((double)count[(int)aa[b]]/(double)tot)*(double)tot;
12184 observed=((double)mat[(int)aa[a]][(int)aa[b]]);
/* Diagnostic output on stderr. */
12187 fprintf ( stderr, "\n%c=%d %c=%d Tot=%d Obs=%d Exp=%d\n", aa[a],count[aa[a]], aa[b],count[aa[b]],tot, mat[aa[a]][aa[b]],(int)expected);
12188 fprintf ( stderr, "\n%d", mat[aa[a]][aa[b]]);
12189 fprintf ( stderr, "\n%d", mat[aa[a]][aa[b]]);
/* The entry becomes 0 when either value is 0, otherwise 10*log(observed/expected).
 * NOTE(review): the (int) cast applies to the constant 10 only; the product is a
 * double that is converted to int by the assignment. */
12191 mat[(int)aa[a]][(int)aa[b]]=(expected==0 || observed==0)?0:((int)10*log((observed/expected)));
/* Header, then a column-title row of upper-cased alphabet letters. */
12195 fprintf (fp,"# BLAST_MATRIX FORMAT\n#ALPHABET=%s\n#TRANSITION MATRIX TRAINED ON %d Sequence\n#", BLAST_AA_ALPHABET, S->nseq);
12196 for (a=0; a< naa; a++)fprintf ( fp, "%3c ", toupper(aa[a]));
/* One output row per alphabet letter: the letter, then its naa scores. */
12198 for (a=0; a< naa; a++)
12201 fprintf (fp, "%c", toupper(aa[a]));
12202 for ( b=0; b< naa; b++)
12204 fprintf (fp, "%3d ", mat[(int)aa[a]][(int)aa[b]]);
12206 fprintf ( fp, "\n");
// mat2cmp: compares two matrices over their 256x256 index range.
// Exits the program with an error message if either matrix pointer is null.
// Returns 0 (a null pointer) when the first scan selects no cell.
// NOTE(review): the cell-selection test, the code filling `list`, and the final
// return are on lines not shown here -- confirm what is returned.
12215 double* mat2cmp ( int **mat1, int **mat2)
12219 if ( !mat1 || !mat2)
12221 fprintf ( stderr, "\nERROR: mat2cmp needs two matrices [FATAL:%s]", PROGRAM);
12222 myexit (EXIT_FAILURE);
// First scan: n is reset to 0 and then used as the row count of `list` below.
12225 for (n=0, a=0; a< 256; a++)
12226 for ( b=0; b<256; b++)
12232 if ( n==0) return 0;
// n rows of two doubles each.
12233 list=declare_double (n, 2);
// Second scan over the same index range, with n restarted at 0.
12235 for (n=0, a=0; a<256; a++)
12236 for ( b=0; b<256; b++)
// return_r() reduces the n collected rows to r; the scratch list is then released.
12247 r=return_r (list, n);
12248 free_double(list, -1);
// read_blast_matrix: reads a substitution matrix from the file `mat_name`
// into a freshly declared 256x256 integer matrix.
// Header lines (starting with '#' or whitespace) are skipped, except that an
// "ALPHABET=" entry supplies the symbol list `alp`.
// Non-gap, non-'*' values are stored at [char-'A'][char-'A'] for every
// upper/lower-case combination of the two symbols.
// NOTE(review): declarations, the computation of n_aa, the row-label check and
// the return statement are on lines not shown here.
12252 int ** read_blast_matrix ( char *mat_name)
12259 char sbuf[VERY_LONG_STRING];
12263 matrix=declare_int (256,256);
// Row 30 is replaced by a longer, 10000-int row.
// NOTE(review): the reason is not visible in this block -- confirm.
12264 vfree ( matrix[30]);
12265 matrix[30]=vcalloc(10000, sizeof (int));
12266 fp=vfopen ( mat_name, "r");
// fgetc() consumes the first character of each header line; fgets() then reads
// the remainder of that line into sbuf.
12267 while ( (c=fgetc(fp))=='#' || isspace(c) )
12270 fgets ( sbuf, VERY_LONG_STRING, fp);
// NOTE(review): "%s" carries no width limit; the size of `alp` is not visible here.
12271 if ( (p=strstr (sbuf, "ALPHABET")))
12272 sscanf (p, "ALPHABET=%s", alp);
12275 lower_string (alp);
// One matrix row per alphabet symbol: a label token followed by n_aa integers.
12278 for ( a=0; a< n_aa; a++)
// NOTE(review): unbounded "%s" into `buf`, whose size is not visible here.
12280 fscanf ( fp, "%s ", buf);
12282 aa1=tolower(buf[0]);
// Error path: reports the label read and the expected alphabet symbol, then exits.
12286 fprintf ( stderr, "\nParsing_error when reading blast_matrix %s:\n%c %c",mat_name, aa1,alp[a]);
12287 fprintf ( stderr, "\n%c ", fgetc(fp));
12288 myexit (EXIT_FAILURE);
12290 for ( b=0; b<n_aa; b++)
12292 aa2=tolower ((char) alp[b]);
12293 fscanf ( fp, "%d ", &value);
// Gap symbols are mapped to GAP_CODE; the non-gap partner is indexed by its raw
// upper- and lower-case character code (no 'A' offset in this branch).
12294 if (is_gap(aa1) || is_gap(aa2))
12297 c1=(is_gap(aa1))?GAP_CODE:aa1;
12298 c2=(is_gap(aa2))?GAP_CODE:aa2;
12299 if ( c1==GAP_CODE && c2==GAP_CODE)
12300 matrix[c1][c2]=value;
12301 else if ( c1==GAP_CODE)
12303 matrix[c1][tolower(c2)]=value;
12304 matrix[c1][toupper(c2)]=value;
12308 matrix[tolower(c1)][c2]=value;
12309 matrix[toupper(c1)][c2]=value;
// Ordinary residues: entries involving '*' are not stored.
12312 else if ( aa1!='*' && aa2!='*')
12314 matrix[tolower(aa1)-'A'][tolower(aa2)-'A']=value;
12315 matrix[toupper(aa1)-'A'][toupper(aa2)-'A']=value;
12316 matrix[tolower(aa1)-'A'][toupper(aa2)-'A']=value;
12317 matrix[toupper(aa1)-'A'][tolower(aa2)-'A']=value;
// output_blast_mat: convenience wrapper around output_mat() that writes `mat`
// to `fname` using BLAST_AA_ALPHABET as the symbol list and 'a' as the index
// offset. Returns whatever output_mat() returns.
12327 int output_blast_mat (int **mat, char *fname)
12329 return output_mat(mat, fname, BLAST_AA_ALPHABET, 'a');
// output_header_mat: writes `mat` to `fname` as the text of a C array
// initialiser ("int new_mat[]={ ... }"), one lower-triangular row per symbol
// (columns 0..a for row a).
// The `alp` parameter is not referenced in the lines visible here.
// NOTE(review): the assignment of naa and the return statement are not visible.
12332 int output_header_mat (int **mat, char *fname, char *alp)
12338 char raa[]="ABCDEFGHIKLMNPQRSTVWXYZ";
12342 aa=vcalloc ( naa+2, sizeof (char));
12343 sprintf ( aa, "%s",raa);
12346 fp=vfopen (fname, "w");
12347 fprintf ( fp, "int new_mat[]={\n");
12349 for (a=0; a<naa; a++)
12351 for (b=0; b<=a; b++)
// NOTE(review): raa is upper case while the index subtracts 'a'; unless aa is
// lower-cased on a line not shown, these indices would be negative -- confirm.
12353 fprintf (fp, "%3d, ", mat[aa[a]-'a'][aa[b]-'a']);
12355 fprintf (fp, "\n");
12357 fprintf ( fp, "}");
// output_mat: writes `mat` to `fname` in the "BLAST_MATRIX FORMAT" text layout.
//   mat    : matrix indexed by (symbol character code - offset)
//   fname  : output file name, opened with vfopen() in "w" mode
//   alp    : string of symbols giving row/column order
//   offset : value subtracted from each symbol's character code to index mat
// Returns 0 if the file cannot be opened.
// NOTE(review): the assignment of naa and the normal return value are on lines
// not shown here.
12360 int output_mat (int **mat, char *fname, char *alp, int offset)
// aa is a private copy of the alphabet string.
12369 aa=vcalloc ( naa+2, sizeof (char));
12370 sprintf ( aa, "%s",alp);
12372 if (!(fp=vfopen (fname, "w")))return 0;
// Header, then one line of upper-cased column labels.
12373 fprintf (fp,"# BLAST_MATRIX FORMAT\n#ALPHABET=%s\n ",alp);
12374 for (a=0; a< naa; a++)fprintf ( fp, "%5c ", toupper(aa[a]));
// One row per symbol: upper-cased row label followed by naa values.
12376 for (a=0; a< naa; a++)
12379 fprintf (fp, "%c", toupper(aa[a]));
12380 for ( b=0; b< naa; b++)
// Any pair involving '*' is written as 0 instead of being looked up in mat.
12382 if (aa[a]!='*' && aa[b]!='*')
12383 fprintf (fp, " %5d", mat[aa[a]-offset][aa[b]-offset]);
12385 fprintf (fp, " %5d", 0);
12387 fprintf ( fp, "\n");
// output_pavie_mat: writes `mat` to `fname` in the "PAVIE_MATRIX FORMAT" text
// layout: one "X Y value" line per unordered symbol pair (b starts at a), with
// the stored integer divided by PAVIE_MAT_FACTOR.
//   gep : gap value; when it differs from UNDEFINED a final "- - value" line is
//         written, also divided by PAVIE_MAT_FACTOR.
//   alp : symbols to output; mat is indexed by (character code - 'A').
// NOTE(review): the assignment of n is on a line not shown here.
12394 void output_pavie_mat (int **mat, char *fname, double gep, char *alp)
12400 fp=vfopen (fname, "w");
12401 fprintf (fp,"# PAVIE_MATRIX FORMAT\n#ALPHABET=%s\n",alp);
12403 for(a=0; a< n; a++)
12405 for ( b=a; b<n; b++)
12407 fprintf (fp, "%c %c %.3f\n", toupper(alp[a]), toupper(alp[b]), (float)mat[alp[a]-'A'][alp[b]-'A']/PAVIE_MAT_FACTOR);
12410 if ( gep!=UNDEFINED)fprintf ( fp, "- - %.3f\n", gep/PAVIE_MAT_FACTOR);
// read_pavie_matrix: reads a matrix file made of "X Y value" lines into a
// freshly declared 256x256 integer matrix.
// Each value is multiplied by PAVIE_MAT_FACTOR and stored symmetrically, for
// every upper/lower-case combination, at index (character code - 'A').
// A "- - value" line sets the gap value `gep`.
// NOTE(review): declarations, the initial value of gep, the computation of n_aa
// and the return statement are on lines not shown here.
12414 int ** read_pavie_matrix ( char *mat_name)
12421 char sbuf[VERY_LONG_STRING];
12425 matrix=declare_int (256,256);
12428 fp=vfopen ( mat_name, "r");
// Header lines: fgetc() consumes the leading '#' or whitespace character and
// fgets() reads the remainder of the line.
12429 while ( (c=fgetc(fp))=='#' || isspace(c) )
12431 fgets ( sbuf, VERY_LONG_STRING, fp);
// The trailing ';' makes this `if` body empty; the sscanf() still fills `alp`
// on a match. NOTE(review): unbounded "%s"; the size of `alp` is not visible.
12432 if ( sscanf (sbuf, "ALPHABET=%s", alp)==1);
// Data lines: two symbols followed by a floating point value.
12437 while ( fgets ( sbuf, VERY_LONG_STRING, fp)!=NULL)
12440 if (sscanf (sbuf, "%c %c %f",&aa1, &aa2, &v)==3)
12442 v*=PAVIE_MAT_FACTOR;
12443 if (aa1=='-' && aa2=='-')gep=v;
// (aa1,aa2) in all four case combinations...
12446 matrix[tolower(aa1)-'A'][tolower(aa2)-'A']=v;
12447 matrix[toupper(aa1)-'A'][toupper(aa2)-'A']=v;
12448 matrix[tolower(aa1)-'A'][toupper(aa2)-'A']=v;
12449 matrix[toupper(aa1)-'A'][tolower(aa2)-'A']=v;
// ...and the mirrored (aa2,aa1) entries.
12451 matrix[tolower(aa2)-'A'][tolower(aa1)-'A']=v;
12452 matrix[toupper(aa2)-'A'][toupper(aa1)-'A']=v;
12453 matrix[tolower(aa2)-'A'][toupper(aa1)-'A']=v;
12454 matrix[toupper(aa2)-'A'][tolower(aa1)-'A']=v;
// When a gap value was read, fill the GAP_CODE column for each alphabet symbol
// whose lower-case entry is still 0.
12458 if ( gep!=UNDEFINED)
12461 for (a=0; a< n_aa; a++)
12463 if (!matrix[tolower(alp[a])-'A'][GAP_CODE])
12465 matrix[tolower(alp[a])-'A'][GAP_CODE]=gep;
12466 matrix[toupper(alp[a])-'A'][GAP_CODE]=gep;
// seq2year: rewrites, in place, every non-gap position of every sequence in S
// as a letter 'a'..'j' computed from a year counter y and `modulo`, and tags
// each sequence name with "_agechannel<modulo>".
// NOTE(review): the handling of sequences without a "_FIRSTYEAR" comment, the
// update of y inside the loop, and the return statement are on lines not shown.
12474 Sequence *seq2year ( Sequence *S, int modulo)
12479 char new_channel[100];
12481 sprintf( new_channel, "_agechannel%d",modulo);
12483 for ( a=0; a<S->nseq; a++)
// The starting year is parsed from a "_FIRSTYEAR<int>_" marker in the comment.
12485 if (S->seq_comment[a] && (s=strstr(S->seq_comment[a], "_FIRSTYEAR")))
12487 sscanf (s, "_FIRSTYEAR%d_", &first);
12491 for ( y=first,b=0; b<S->len[a]; b++)
12493 if ( !is_gap(S->seq[a][b]))
// (y/modulo)%10 selects one of ten letters starting at 'a'.
12495 S->seq[a][b]='a'+((y/modulo))%10;
// An existing "_agechannel..." suffix is overwritten from that point on;
// otherwise the new suffix is appended.
12499 if ( (s=strstr ( S->name[a], "_agechannel")))
12501 sprintf ( s, "%s", new_channel);
// NOTE(review): assumes S->name[a] has room for the suffix -- confirm.
12503 else strcat (S->name[a], new_channel);
// output_n_pavie_age_channel: calls output_pavie_age_channel() n times on S,
// with a modulo of 1, 10, 100, ... (x is multiplied by 10 on each iteration).
// S is replaced by the value returned from each call.
// NOTE(review): the return statement is on a line not shown here.
12508 Sequence* output_n_pavie_age_channel (Sequence *S, char *name, int n)
12514 for ( x=1,a=0; a< n; a++, x*=10)
12516 S=output_pavie_age_channel(S, name,x);
// output_pavie_age_channel: for one `modulo`, writes an age matrix file,
// appends its name to "<name>_pavie_age_matrix.mat_list", converts S with
// seq2year() and appends the converted sequences, in FASTA form, to
// "<name>_age_channel.fasta".
// NOTE(review): several conditions (e.g. those involving the static `display`
// flag), the fclose of fp, and the return statement are on lines not shown.
12524 Sequence* output_pavie_age_channel (Sequence *S, char *name, int modulo)
12528 static int display;
12529 char mat_list_name[100];
12530 char seq_list[1000];
12531 char mat_name[1000];
// NOTE(review): sprintf() is unbounded and mat_list_name holds only 100 bytes;
// a long `name` would overflow it -- confirm the callers' limits.
12534 sprintf ( mat_list_name, "%s_pavie_age_matrix.mat_list", name);
12535 sprintf (seq_list, "%s_age_channel.fasta",name);
// Remove output files left over from an earlier run.
12539 if (check_file_exists(seq_list))vremove (seq_list);
12540 if (check_file_exists(mat_list_name))vremove (mat_list_name);
12542 sprintf (mat_name, "%s_age_mat_mod%d.mat",name, modulo);
12543 output_age_matrix ( mat_name, modulo);
// The list file is opened in append mode.
12545 fp=vfopen ( mat_list_name,"a");
12546 fprintf ( fp, "%s\n", mat_name);
// Sequences are written to a temporary file and then concatenated onto seq_list.
12549 S=seq2year (S,modulo);
12550 A=seq2aln (S, NULL, KEEP_GAP);
12551 output_fasta_seq (tmp=vtmpnam (NULL),A);
12552 file_cat ( tmp, seq_list);
12556 display_output_filename ( stdout, "AGE_MAT_LIST", "MAT_LIST", mat_list_name, CHECK);
12557 display_output_filename ( stdout, "AGE_SEQ", "FASTA", seq_list, CHECK);
12560 fprintf ( stderr, "\nModulo:%d years", modulo);
12561 fprintf ( stderr, "\n");
12566 // Name manipulation
// clean_aln: normalises the text fields of an alignment in place.
// seq_comment and aln_comment go through clean_string(), names through
// translate_names(), and the attached Sequence A->S through clean_sequence().
// NOTE(review): any null checks and the return statement are on lines not
// shown here.
12569 Alignment *clean_aln (Alignment *A)
12573 A->seq_comment=clean_string (A->nseq, A->seq_comment);
12574 A->aln_comment=clean_string (A->nseq, A->aln_comment);
12575 A->name=translate_names(A->nseq, A->name);
12576 (A->S)=clean_sequence ((A->S));
// clean_sequence: normalises the text fields of a Sequence in place.
// seq_comment goes through clean_string() and names through translate_names().
// NOTE(review): any null check on S and the return statement are on lines not
// shown here.
12580 Sequence *clean_sequence ( Sequence *S)
12584 S->seq_comment=clean_string (S->nseq, S->seq_comment);
12585 S->name=translate_names(S->nseq, S->name);
// translate_names: applies translate_name() to each of the n entries of `name`,
// storing the returned pointer back into the array (translate_name() may
// reallocate the string).
// NOTE(review): the return statement is on a line not shown here.
12588 char ** translate_names (int n, char **name)
12591 for ( a=0; a<n; a++)
12592 name[a]=translate_name(name[a]);
// translate_name: sanitises one name in place and then replaces it with its
// decoded form.
// NOTE(review): the declaration of buf, the computation of len and the return
// statement are on lines not shown here.
12595 char * translate_name ( char *name)
12604 //if ( name[0]=='\'')return name;
12606 for ( a=0; a<len; a++)
// Whitespace becomes a string terminator, so the name is cut at its first
// whitespace character; listed punctuation characters become '_'.
12608 if ( isspace(name[a]))name[a]='\0';
12609 else if (strchr (";(),:#><'�", name[a]))name[a]='_';
// decode_name() in DECODE mode maps a coded name back to its stored original
// (or returns the name unchanged).
12612 sprintf (buf,"%s",decode_name (name, DECODE));
// Grow the buffer when the decoded text is longer than the current array size.
// NOTE(review): read_array_size_new() presumably reports the allocated size.
12613 if ( strlen (buf)>read_array_size_new ((char *)name))
12615 name=vrealloc (name, sizeof (char)*(strlen (buf)+1));
12617 sprintf (name, "%s", buf);
// decode_name: keeps a static table of (original name, generated code) pairs
// and translates between the two according to `mode`.
//   - one branch adds or looks up `name` and returns its code "<tag>_<k>"
//   - DECODE finds the tag inside `name` and returns the stored original
//   - CODELIST writes "#CODE: original <=> code" lines to a temporary file
//   - any other mode terminates the program with an error
// Returned pointers refer to the static table or to `name` itself.
// NOTE(review): the mode tests guarding several sections, and the declarations
// of n, p, i and file, are on lines not shown here.
12621 char *decode_name (char *name, int mode)
12623 static char ***name_list;
12625 static char tag[100];
// Releases every stored pair.
// NOTE(review): the condition triggering this is not visible.
12630 for (a=0; a<n; a++)
12632 vfree (name_list[a][0]);
12633 vfree (name_list[a][1]);
12634 vfree (name_list[a]);
12641 if ( mode == CODELIST)
12644 file=vtmpnam (NULL);
12645 for (a=0; a< n; a++)
12646 printf_file(file, "a", "#CODE: %s <=> %s\n", name_list[a][0], name_list[a][1]);
// With no table, decoding is the identity; a null name is returned unchanged.
12649 if (mode ==DECODE && name_list==NULL)return name;
12650 if ( name==NULL) return name;
// The tag embeds a pseudo-random number.
// NOTE(review): presumably generated once; the guard is not visible.
12657 sprintf ( tag, "TCTAG_%d",rand ()%100000);
// Known name: return its existing code.
12662 for (a=0; a< n; a++)
12663 if ( strm (name, name_list[a][0]))return name_list[a][1];
// New name: append one pair. NOTE(review): plain realloc() is used here (its
// result is unchecked) while entries are released with vfree() above -- confirm
// that the two allocators are compatible.
12666 name_list=realloc (name_list, sizeof (char**)*(n+1));
12667 name_list[n]=vcalloc (2, sizeof (char*));
12668 name_list[n][0]=vcalloc (strlen (name)+1, sizeof (char));
12669 name_list[n][1]=vcalloc (100, sizeof (char));
12670 sprintf ( name_list[n][0], "%s", name);
// Codes are numbered from 1.
12671 sprintf ( name_list[n][1], "%s_%d", tag,n+1);
12672 return name_list[n++][1];
12674 else if ( mode ==DECODE)
// A name without the tag is returned unchanged.
12678 if ( !(p=after_strstr (name, tag)))return name;
// NOTE(review): the parsed index i is used without a range check against n.
12681 sscanf (p, "_%d", &i);
12682 return name_list[i-1][0];
12687 printf_exit (EXIT_FAILURE, stderr,"Unknown Mode for Decode_name [FATAL:%s]", PROGRAM);
// display_sequences_names: prints to `fp` a summary line for the sequence set S
// followed by one line per sequence (input file, name, length, type, structure
// status), and optionally the sequence templates.
//   check_pdb_status : when non-zero, a "Struct Yes/No" status is printed;
//                      otherwise "Struct Unchecked"
//   print_templates  : when non-zero, display_sequence_templates() is called
//                      for each sequence and its return value replaces fp
// NOTE(review): the condition guarding the fatal error, the declarations and
// the return statement are on lines not shown here.
12693 FILE * display_sequences_names (Sequence *S, FILE *fp, int check_pdb_status, int print_templates)
12701 fprintf (fp,"\nERROR: NO SEQUENCE READ [FATAL:%s]\n", PROGRAM); myexit (EXIT_FAILURE);
// max_len is the longest sequence name; it is used below as the column width
// for both the file and the name fields.
12703 for ( a=0, max_len=0; a< S->nseq; a++)max_len=MAX(max_len, strlen (S->name[a]));
12704 fprintf ( fp, "\nINPUT SEQUENCES: %d SEQUENCES [%s]", S->nseq,(S->type)?S->type:"Unknown type");
12705 for ( a=0; a< S->nseq; a++)
// NOTE(review): S->type is passed to %s without the null guard used above.
12707 fprintf (fp, "\n Input File %-*s Seq %-*s Length %4d type %s",max_len,S->file[a], max_len,S->name[a],(int)strlen ( S->seq[a]), S->type);
12708 if (check_pdb_status)
// NOTE(review): two alternative structure checks are visible below; one of them
// may be disabled on lines not shown.
12710 if ((r=seq_is_pdb_struc (S, a)))fprintf (fp, " Struct Yes PDBID %s", get_pdb_id(r));
12711 else fprintf (fp, " Struct No");
12713 if (is_pdb_struc (S->name[a])||is_pdb_struc (S->file[a]) )fprintf (fp, " Struct Yes");
12714 else fprintf (fp, " Struct No");
12717 else fprintf (fp, " Struct Unchecked");
12718 if ( print_templates)fp=display_sequence_templates (S, a, fp);
12722 fprintf ( fp, "\n");
// add_file2file_list: stores `name` in the name array of S, creating S when it
// is null, and increments S->nseq.
// NOTE(review): `S->nseq=0;` shares a line with the else branch but is a
// separate statement, so it runs on BOTH branches. As written, `name` is always
// stored in slot 0 and nseq ends at 1, even when S already held entries.
// Confirm whether the reset was meant for the creation branch only.
// NOTE(review): the return statement is on a line not shown here.
12726 Sequence *add_file2file_list (char *name, Sequence *S)
12729 if (!S) S=declare_sequence (1,1,10);
12730 else S=realloc_sequence (S,S->nseq+1,0);S->nseq=0;
12732 sprintf ( S->name[S->nseq++], "%s", name);
// parse_phecomp_data: positions the input after the "[EXPERIMENT HEADER]" line
// and then reads the remaining lines.
// NOTE(review): the visible lines look unfinished:
//   - `in` is a char* but is passed to and assigned from
//     quick_find_token_in_file(), which takes and returns FILE*
//   - the fgets() arguments are in the order (buffer, fp, MAX_LINE_LENGTH),
//     whereas the standard order is (buffer, size, stream)
//   - `fp` is not declared in the lines shown, `buffer` is never allocated
//     here, and `out` is not referenced
// Confirm whether this function is compiled at all.
12737 int parse_phecomp_data (char *in, char *out)
12739 static char *buffer;
12740 in=quick_find_token_in_file (in, "[EXPERIMENT HEADER]");
12741 while (fgets (buffer,fp,MAX_LINE_LENGTH));
// quick_find_token_in_file: reads `fp` line by line and returns it as soon as
// a line containing `token` has been consumed.
// NOTE(review): the value returned when the token is never found is on a line
// not shown here.
12743 FILE * quick_find_token_in_file (FILE *fp, char *token)
12745 //returns fp pointing to the begining of the line FOLLOWING the line containing token
12746 static char *buffer;
// NOTE(review): the allocation tests and assigns `line`, while the reads below
// use `buffer`; in the visible lines `buffer` is never allocated -- confirm.
12747 if (!line) line=vcalloc (MAX_LINE_LENGTH+1, sizeof (char));
12748 while (fgets (buffer,MAX_LINE_LENGTH, fp)!=NULL)
12749 if (strstr (buffer,token))return fp;
// file2cage: takes a string `file` and an integer `cage`, returns a pointer to int.
// NOTE(review): the body is not visible here; semantics to be confirmed.
12754 int * file2cage (char *file, int cage)
12767 /******************************COPYRIGHT NOTICE*******************************/
12768 /*© Centro de Regulacio Genomica */
12770 /*Cedric Notredame */
12771 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
12772 /*All rights reserved.*/
12773 /*This file is part of T-COFFEE.*/
12775 /* T-COFFEE is free software; you can redistribute it and/or modify*/
12776 /* it under the terms of the GNU General Public License as published by*/
12777 /* the Free Software Foundation; either version 2 of the License, or*/
12778 /* (at your option) any later version.*/
12780 /* T-COFFEE is distributed in the hope that it will be useful,*/
12781 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
12782 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
12783 /* GNU General Public License for more details.*/
12785 /* You should have received a copy of the GNU General Public License*/
12786 /* along with T-COFFEE; if not, write to the Free Software*/
12787 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
12788 /*............................................... |*/
12789 /* If you need some more information*/
12790 /* cedric.notredame@europe.com*/
12791 /*............................................... |*/
12795 /******************************COPYRIGHT NOTICE*******************************/