7 #include "io_lib_header.h"
8 #include "util_lib_header.h"
9 #include "define_header.h"
10 #include "dp_lib_header.h"
12 /******************************************************************/
16 /******************************************************************/
17 Constraint_list *prepare_cl_for_moca ( Constraint_list *CL)
24 /*Prepare the constraint list*/
26 CL->get_dp_cost=moca_slow_get_dp_cost;
27 CL->evaluate_residue_pair=moca_residue_pair_extended_list;
29 /*Prepare the moca parameters*/
30 (CL->moca)->evaluate_domain=evaluate_moca_domain;
31 (CL->moca)->cache_cl_with_domain=cache_cl_with_moca_domain;
32 (CL->moca)->make_nol_aln=make_moca_nol_aln;
34 /*Prepare the packing of the sequences*/
35 for ( a=0, b=1; a< (CL->S)->nseq; a++)b+=strlen ( (CL->S)->seq[a])+1;
37 seq =declare_char ( 1,b+1);
38 name=declare_char( 1,30);
39 CL->packed_seq_lu =declare_int ( b, 2);
42 for (tot_l=1,a=0; a< (CL->S)->nseq; a++)
44 strcat (seq[0], (CL->S)->seq[a]);
46 l=strlen((CL->S)->seq[a]);
47 for ( c=1; c<= l; c++, tot_l++)
49 CL->packed_seq_lu[tot_l][0]=a;
50 CL->packed_seq_lu[tot_l][1]=c;
52 CL->packed_seq_lu[tot_l++][0]=UNDEFINED;
54 sprintf ( name[0], "catseq");
55 NS=fill_sequence_struc(1, seq, name);
56 CL->S=add_sequence (NS, CL->S, 0);
59 free_sequence (NS, NS->nseq);
65 Alignment ** moca_aln ( Constraint_list *CL)
68 function documentation: start
70 Alignment ** moca_aln ( Constraint_list *CL)
72 This function inputs CL and outputs a series of local multiple alignments
73 contained in aln_list;
75 The terminator of aln_list is set to NULL;
77 function documentation: end
81 static int max_n_domains=1000;
88 aln_list=vcalloc (max_n_domains, sizeof (Alignment *));
89 if ((CL->moca)->moca_interactive)aln_list[n_domains++]=extract_domain ( CL);
92 while ( (aln_list[n_domains++]=extract_domain ( CL))!=NULL)
94 if ((CL->moca)->moca_len)break;
95 if ( n_domains==max_n_domains)
98 aln_list=vrealloc (aln_list, max_n_domains*sizeof (Alignment*));
105 Alignment * extract_domain ( Constraint_list *CL)
108 function documentation: start
109 Alignment * extract_domain ( Constraint_list *CL)
111 given a CL, this function extracts the next best scoring local multiple alignment
112 It returns a CL where the aligned residues have been indicated in (CL->moca)->forbiden_residues;
114 the local alignment is extracted with the dp function indicated by
115 CL->dp_mode: (gotoh_sw_pair_wise)
117 CL->get_dp_cost=slow_get_dp_cost;
118 CL->evaluate_residue_pair=sw_residue_pair_extended_list;
120 (CL->moca)->evaluate_domain=evaluate_moca_domain;
122 (CL->moca)->cache_cl_with_domain=cache_cl_with_moca_domain;
123 Domain post processing:
124 (CL->moca)->make_nol_aln=make_moca_nol_aln;
125 function documentation: end
127 int min_start, max_start, start,min_len, max_len, len, score;
130 Alignment *RESULT=NULL;
136 /*CASE 1: Non Automatic Domain Extraction*/
137 if ((CL->moca)->moca_interactive)
139 return interactive_domain_extraction (CL);
141 else if ((CL->moca)->moca_len)
143 while ((C=extract_domain_with_coordinates (C,(CL->moca)->moca_start,(CL->moca)->moca_len,CL))->nseq==0)(CL->moca)->moca_scale=(CL->moca)->moca_scale*0.9;
144 RESULT=copy_aln ( C, RESULT);
145 unpack_seq_aln (RESULT, CL);
146 output_format_aln ("mocca_aln",RESULT,EA=fast_coffee_evaluate_output(RESULT, CL),"stdout");
151 else if ( !(CL->moca)->moca_len)
153 analyse_sequence (CL);
154 myexit (EXIT_FAILURE);
157 /*CASE 2: Automatic Domain Extraction: Find Coordinates*/
164 max_start=strlen ((CL->S)->seq[0]);
166 max_len=strlen ((CL->S)->seq[0]);
168 C=extract_domain_with_coordinates (C,13,30,CL);
172 (CL->moca)->moca_scale=-180;
173 C=add_seq2aln (CL,C, CL->S);
176 (CL->moca)->moca_scale=-160;
177 C=add_seq2aln (CL,C, CL->S);
180 myexit (EXIT_FAILURE);
184 C=approximate_domain (min_start,max_start,step,min_len,max_len, step,&start, &len, &score, CL);
185 min_start=start-step;
186 max_start=start+step;
192 C=extract_domain_with_coordinates (C,start-10, len+20,CL);
196 myexit (EXIT_FAILURE);
201 Alignment * interactive_domain_extraction ( Constraint_list *CL)
213 char last_start[100];
214 char out_format[100];
215 Alignment *RESULT=NULL;
216 Alignment *PREVIOUS=NULL;
223 choice=vcalloc ( 100, sizeof (char));
224 parameters=declare_int (10000, 4);
226 parameters[0][START]=(CL->moca)->moca_start;
227 parameters[0][LEN]= (CL->moca)->moca_len;
228 parameters[0][SCALE]=(CL->moca)->moca_scale;
229 parameters[0][GOPP]=CL->gop;
231 sprintf ( last_start, "%d", (CL->moca)->moca_start);
232 sprintf ( out_format, "mocca_aln");
234 print_moca_interactive_choices ();
235 while ( !strm4 (choice, "Q","X", "q", "x" ))
239 if (c=='b' || c=='B')
241 iteration-=atoi(choice+1)+1;
243 if (iteration<0)iteration=1;
248 parameters[iteration][START]=parameters[iteration-1][START];
249 parameters[iteration][LEN]=parameters[iteration-1][LEN];
250 parameters[iteration][SCALE]=parameters[iteration-1][SCALE];
251 parameters[iteration][GOPP]=parameters[iteration-1][GOPP];
253 if ( c=='>')parameters[iteration][LEN]=atoi(choice+1);
256 sprintf ( last_start, "%s", choice);
257 parameters[iteration][START]=0;
258 s=strrchr(choice, ':');
262 parameters[iteration][START]=atoi(choice+1);
269 if((index=name_is_in_list (choice+1,(CL->S)->name,(CL->S)->nseq,100))==-1)
271 fprintf ( stderr, "\n\tERROR: %s NOT in Sequence Set",choice+1);
275 for ( a=0; a< index; a++)
277 parameters[iteration][START]+=(CL->S)->len[a]+1;
279 parameters[iteration][START]+=atoi(s+1)-1;
283 else if ( c=='C'||c=='c')parameters[iteration][SCALE]=atoi(choice+1);
284 else if ( c=='G'||c=='g')
286 parameters[iteration][GOPP]=atoi(choice+1);
287 CL->gop=parameters[iteration][GOPP];
289 else if ( c=='F'||c=='f')
291 sprintf ( out_format, "%s", choice+1);
293 else if ( c=='S'||c=='s')
295 if (choice[1]=='\0')sprintf ( choice, "default.domain_aln.%d", iteration);
296 output_format_aln (out_format,RESULT,EA=fast_coffee_evaluate_output(RESULT, CL),choice+1);
297 fprintf (stderr, "\tOutput file [%15s] in [%10s] format\n",choice+1,out_format);
302 if ( parameters[iteration][SCALE]>0)
304 fprintf ( stderr, "\nWARNING: THRESHOLD RESET to 0");
305 parameters[iteration][SCALE]=0;
308 (CL->moca)->moca_scale=parameters[iteration][SCALE];
309 CL->gop=parameters[iteration][GOPP];
311 C=extract_domain_with_coordinates (C,parameters[iteration][START],parameters[iteration][LEN],CL);
315 fprintf ( stderr, "\nERROR: ILLEGAL COORDINATES! SEQUENCE BOUNDARY CROSSED\n");
316 for ( b=1,a=0; a< (CL->S)->nseq-1; a++)
319 fprintf ( stderr, "\n\t%15s=> Abs:[%d %d] Rel:[0 %d]", (CL->S)->name[a],b, b+(CL->S)->len[a]-1,(CL->S)->len[a]);
322 fprintf ( stderr, "\n");
324 else if (parameters[iteration][START]==0 && parameters[iteration][LEN]==0)
326 fprintf ( stderr, "\n\tEnter the following parameters:\n\n\t\tSTART value: |x [Return]\n\t\tLENgth value: >y [Return]\n\t\ttype [Return]\n\n");
327 fprintf ( stderr, "\n\n\tSTART is measured on the total length of the concatenated sequences\n\tx and y are positive integers\n\n");
330 else if ( C->nseq==0)
332 fprintf ( stderr, "\nNO MATCH FOUND: LOWER THE SCALE (C)\n");
336 RESULT=copy_aln ( C, RESULT);
337 unpack_seq_aln (RESULT, CL);
338 RESULT->output_res_num=1;
340 output_format_aln (out_format,RESULT,EA=fast_coffee_evaluate_output(RESULT, CL),"stdout");
342 PREVIOUS=copy_aln ( RESULT, PREVIOUS);
344 print_moca_interactive_choices ();
349 fprintf ( stderr, "\t[ITERATION %3d][START=%s][LEN=%3d][GOPP=%3d][SCALE=%4d]\t",iteration,last_start,parameters[iteration][LEN],parameters[iteration][GOPP],parameters[iteration][SCALE]);
351 fprintf ( stderr, "Your Choice: ");
352 while ( (c=fgetc(stdin))!='\n')choice[a++]=c;
357 if (!RESULT)myexit(EXIT_SUCCESS);
358 if ( RESULT)RESULT->output_res_num=0;
362 int print_moca_interactive_choices ()
364 fprintf ( stderr, "\n**************************************************************");
365 fprintf ( stderr, "\n******************** MOCCA: %s ***********",VERSION);
366 fprintf ( stderr, "\n**************************************************************");
369 fprintf ( stderr, "\nMENU: Type Flag[number] and Return: ex |10");
371 fprintf ( stderr, "\n\t|x -->Set the START to x");
372 fprintf ( stderr, "\n\t 100 start=100 on concatenated sequences");
373 fprintf ( stderr, "\n\t human:100 start=100 on human sequence");
374 fprintf ( stderr, "\n\t>x -->Set the LEN to x");
375 fprintf ( stderr, "\n\tGx -->Set the Gap Opening Penalty to x");
376 fprintf ( stderr, "\n\tCx -->Set the sCale to x");
377 fprintf ( stderr, "\n\tSname -->Save the Alignment ");
378 fprintf ( stderr, "\n\tFformat -->Save the Alignment Format");
380 fprintf ( stderr, "\n\treturn -->Compute the Alignment");
382 fprintf ( stderr, "\n\tX -->eXit\n\n");
387 Alignment * approximate_domain ( int min_start, int max_start, int step_start,int min_len, int max_len, int step_len, int *best_start, int *best_len, int *best_score, Constraint_list *CL)
394 /*1 Extract the first*/
395 best_score[0]=UNDEFINED;
396 best_start[0]=min_start;
399 for (start=min_start; start< max_start; start+=step_start)
401 for ( len=min_len; len<max_len; len+=step_len)
403 C=extract_domain_with_coordinates (C,start,len,CL);
404 if ( C==NULL)continue;
405 score=((CL->moca)->evaluate_domain)(C, CL);
406 fprintf ( stderr, "\nSTART=%d LEN=%3d SCORE=%5d [%d]",start,len,score, C->nseq);
409 if ( best_score[0]==UNDEFINED)best_score[0]=score;
410 if ( score>best_score[0])
419 C=extract_domain_with_coordinates (C,best_start[0], best_len[0],CL);
423 int measure_domain_length ( Constraint_list *CL,Alignment *IN, int start, int min_len, int max_len, int step)
426 int score, best_score,best_len,a, b, l;
427 int *score_matrix, *len_matrix;
430 score_matrix=vcalloc ( max_len, sizeof (int));
431 len_matrix=vcalloc ( max_len, sizeof (int));
434 l=strlen ( (CL->S)->seq[0]);
436 min_len=MAX(0, min_len);
437 min_len=MIN(l-start, min_len);
439 if ( !IN)C=extract_domain_with_coordinates (C,start,min_len, CL);
444 for ( a=0; a< C->nseq; a++)C->seq_al[a][min_len]='\0';
445 C=add_seq2aln (CL,C, CL->S);
448 best_score= score=((CL->moca)->evaluate_domain)(C, CL);
451 min_len=MAX(0, min_len);
452 for ( best_len=best_val=n_val=0,b=min_len; b<max_len && (start+b)<l; b+=step, n_val++)
454 if ( !IN)C=extract_domain_with_coordinates (C,start, b, CL);
459 for ( a=0; a< C->nseq; a++)C->seq_al[a][b]='\0';
460 C=add_seq2aln (CL,C, CL->S);
462 if ( C->len_aln>0 )score=((CL->moca)->evaluate_domain)(C, CL);
465 if ( score< -3000)break;
467 fprintf ( stderr, "\n\t%d %d=>%d (%d, %d)[%d]",start, b, score, C->nseq, C->len_aln, step);
468 score_matrix[n_val]=score;
469 len_matrix [n_val]=b;
470 if ( score>best_score)
479 for ( a=best_val; a<n_val;a++)
481 if (score_matrix[a]>best_score/2)best_len=len_matrix[a];
484 vfree ( score_matrix);
490 Alignment *extract_domain_with_coordinates ( Alignment *RESULT,int start, int len, Constraint_list *CL)
494 Alignment *SEQ_DOMAIN=NULL;
500 /*ADJUST THE DIRECTION OF THE DOMAIN: len<0:left and len>0:right*/
509 /*CHECK THAT THE BOUNDARY CONDITIONS*/
512 if (start<0 || (!CL->packed_seq_lu && (start+len)>strlen((CL->S)->seq[0])) ||(CL->packed_seq_lu && (start+len)>strlen((CL->S)->seq[(CL->S)->nseq-1])) )return NULL;
515 for ( a=start; a< start+len; a++)
517 if ((CL->moca)->forbiden_residues && (CL->moca)->forbiden_residues[0][a+1]==UNDEFINED)
519 fprintf ( stderr, "*");
525 /*EXTRACT THE DOMAIN*/
527 SEQ_DOMAIN=add_seq2aln (CL,SEQ_DOMAIN, CL->S);
528 buf=extract_char (SEQ_DOMAIN->seq_al[0], start, len);
530 for (a=0; a<len; a++)if ( buf[a]=='X'){free_aln(SEQ_DOMAIN);return NULL;}
531 sprintf ( SEQ_DOMAIN->seq_al[0], "%s", buf);
532 SEQ_DOMAIN->order[0][1]=start;
533 SEQ_DOMAIN=add_seq2aln (CL,SEQ_DOMAIN, CL->S);
543 int get_starting_point ( Constraint_list *CL)
558 l=strlen ( (CL->S)->seq[0]);
560 seq=declare_int ( l, 2);
562 for ( a=0; a<CL->ne;a++)
564 entry=extract_entry (entry, a, CL);
566 seq[entry[R1]][1]=entry[R1];
567 seq[entry[R2]][1]=entry[R2];
568 if ((CL->moca) && (CL->moca)->forbiden_residues && ((CL->moca)->forbiden_residues[0][entry[R1]]==UNDEFINED||(CL->moca)->forbiden_residues[0][entry[R2]]==UNDEFINED ))continue;
571 seq[entry[R1]][0]+=entry[MISC];
572 seq[entry[R2]][0]+=entry[MISC];
576 sort_int_inv ( seq, 2, 0, 0, l-1);
577 fprintf ( stderr, "\nStart=%d %d", seq[0][1], seq[0][0]);
579 CL=index_res_constraint_list (CL,WE);
588 int * analyse_sequence ( Constraint_list *CL)
591 int len, start, n_dots;
592 int left, right, tw, r, w;
593 int best_tw, best_start=0, best_len=0;
597 CL=index_res_constraint_list ( CL, WE);
598 l=strlen (( CL->S)->seq[0]);
600 for ( best_tw=UNDEFINED,start=0; start<l; start++)
602 for ( len=10; len< max_len && start+len<l; len++)
606 for (tw=0, p=0; p<len; p++)
608 n_dots=CL->residue_index[0][p+start+1][0];
610 for ( a=1; a<n_dots; a+=3)
613 r=CL->residue_index[0][p+start+1][a+1];
614 w=CL->residue_index[0][p+start+1][a+2];
616 if (r<left || r>right)tw+=w;
620 if ( tw> best_tw || best_tw==UNDEFINED)
628 fprintf ( stderr, "\nStart=%d Len=%d", best_start, best_len);
632 /*********************************COPYRIGHT NOTICE**********************************/
633 /*© Centre National de la Recherche Scientifique (CNRS) */
635 /*Cedric Notredame */
636 /*Fri Aug 8 19:03:27 MDT 2003. */
637 /*All rights reserved.*/
639 /* This file is an integral part of the */
640 /* T-COFFEE Software. */
641 /* Its content is protected and all */
642 /* the conditions mentioned in the licensing */
643 /* agreement of the software apply to this file.*/
644 /*............................................... |*/
645 /* If you need some more information, or if you */
646 /* wish to obtain a full license, please contact: */
647 /* cedric.notredame@europe.com*/
648 /*............................................... |*/
652 /*********************************COPYRIGHT NOTICE**********************************/
653 /*********************************COPYRIGHT NOTICE**********************************/
654 /*© Centro de Regulacio Genomica */
656 /*Cedric Notredame */
657 /*Tue Oct 27 10:12:26 WEST 2009. */
658 /*All rights reserved.*/
659 /*This file is part of T-COFFEE.*/
661 /* T-COFFEE is free software; you can redistribute it and/or modify*/
662 /* it under the terms of the GNU General Public License as published by*/
663 /* the Free Software Foundation; either version 2 of the License, or*/
664 /* (at your option) any later version.*/
666 /* T-COFFEE is distributed in the hope that it will be useful,*/
667 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
668 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
669 /* GNU General Public License for more details.*/
671 /* You should have received a copy of the GNU General Public License*/
672 /* along with Foobar; if not, write to the Free Software*/
673 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
674 /*............................................... |*/
675 /* If you need some more information*/
676 /* cedric.notredame@europe.com*/
677 /*............................................... |*/
681 /*********************************COPYRIGHT NOTICE**********************************/