7 #include "io_lib_header.h"
8 #include "util_lib_header.h"
9 #include "define_header.h"
11 #include "dp_lib_header.h"
15 23/06/00, Cedric Notredame
18 1-Content of the data structures.
19 2-Implementing your own function in pdb_align.
20 3-Using that function with T-Coffee (multiple Sequence Alignment).
21 4-Syntax rules as defined by Philipp Bucher (19/06/00).
22 5-Current Shortcomings
25 1-Content of the data structures
27 This file only contains a dummy function to help you create
28 your own matching potential function (Step 2 in the Notations RULES
30 int evaluate_match_score ( Constraint_list *CL, int A, int i, int B, int j)
32 returns a score, expected to be between -100 and 100, that corresponds to the matching of
35 Most needed parameters are included in the data structure CL,
36 This Data Structure is declared in util_constraint_list.h
37 The following, non exhaustive list explains the most common parameters
39 The neighborhood is computed using:
40 ((CL->T[A])->pdb_param)->maximum_distance as a radius for the Bubble
41 ((CL->T[A])->pdb_param)->n_excluded_nb are excluded around the central residue
42 i.e i-1 and i+1 for n_excluded_nb=1.
45 ((CL->T[A])->Bubble)->nb[i][0] --> Number of residues in the bubble around A_i
46 ((CL->T[A])->Bubble)->nb[i][k]=j --> Index of the kth residue in the bubble
47 Residues are sorted according to the Ca chain
48 ((CL->T[A])->Bubble)->d_nb[i][k]=d --> Distance between A_i and A_j equals d;
50 ((CL->T[A])->ca[i]->x -----------> Coordinates of the Ca A_i
56 ((CL->T[A])->len -----------> Length of Chain A.
57 ((CL->T[A])->n_atom -----------> n atoms in A.
60 Unspecified parameters can be passed from the command line:
62 align_pdb -extra_parameters=10, 10.3, 11, 12.4, my_file
64 The values of these parameters can be accessed in:
66 ((CL->T[A])->pdb_param)->n_extra_param=4
67 ((CL->T[A])->pdb_param)->extra_param[0]="10"
68 ((CL->T[A])->pdb_param)->extra_param[1]="10.3"
69 ((CL->T[A])->pdb_param)->extra_param[2]="11.6"
70 ((CL->T[A])->pdb_param)->extra_param[3]="my_file"
72 These parameters contain strings! To get the real values, in C, use atoi and atof:
73 atoi ( ((CL->T[A])->pdb_param)->extra_param[0])=10;
74 atof ( ((CL->T[A])->pdb_param)->extra_param[1])=10.3;
76 The maximum number of parameters is currently 1000...
80 2-Implementing your own function
82 all you need to do is to edit this file and recompile align_pdb.
83 There is no need to prototype any function.
85 10 functions holders exist, that correspond to the 10 dummy functions
86 declared in this file:
87 custom_pair_score_function1
88 custom_pair_score_function2
89 custom_pair_score_function3
90 custom_pair_score_function4
92 custom_pair_score_function10
94 Let us imagine, you want to use custom_pair_function1.
96 1-In CUSTOM_evaluate_for_struc.c, modify custom_pair_function1,
97 so that it computes the score you need.
99 2-If you need extra parameters, get them from ((CL->T[A])->pdb_param)->extra_param.
100 3-Recompile pdb_align:
104 4-run the program as follows:
106 align_pdb -in <struc1> <struc2> -hasch_mode=custom_pair_score_function1
107 -extra_param=10, 12, 0.4, matrix...
109 5-My advice for a first time: make a very simple dummy function that spits
110 out the content of extra_param.
112 6-Remember it is your responsability to control the number of extra parameters
113 and their proper order, and type. Do not forget to apply atoi and atof to the parameters
115 7-Remember that the modifications you made to CUSTOM_evaluate_for_sytructure
116 must be preserved by you!!! They may disappear if you update align_pdb, save them
117 appart as your own patch.
122 3-Using that function with T-Coffee (multiple Sequence Alignment).
124 1- setenv ALIGN_PDB_4_TCOFFEE <your version of align_pdb>
127 To do so, you will NOT NEED to recompile T-Coffee, simply type:
128 t_coffee -in <struc1> <struc2> ... custom1_align_pdb_pair
132 4-Syntax rules as defined by Philipp Bucher (19/06/00).
134 Proposed ascii text notation for align_pdb
136 First, let us summarize the align pdb algorithm in plain
139 Given are two protein structures A and B.
141 Step 1: For each residue in each structure extract
142 the local structural neighbourhood. A neighbourhood
143 is simply a subset of (usually non-consecutive)
144 residues from one of the structures.
146 Step 2: For all possible pairs of residues between structures
147 A and B, compute the optimal neighbourhood alignment
148 score. This score, which is also referred to as
149 local neighbourhood similarity (LNS) score indicates
150 whether two residues have similar local stuctural
153 Step 3: Generate one (or multiple) optimal structural alignment(s)
154 for A and B based on LNS scores plus some gap penalty
157 Now, some rules for ascii/email notation:
159 - Whenever possible use a style which fits on one line (because it
160 is painful to modify formulas that span over several lines). Example:
162 Use: ( a**2 + b**2 )**0.5
167 Introduce local variables/functions to split long expressions over
170 Score = Sum(0<i<I+1) Match(A_i,B_i) where
174 - Pseudosubscript notation for (multiply) indexed variables:
178 As a general rule, I propose that we always use lower case
179 letters for indices, and that the corresponding upper case letters
180 denote the number of indexed objects.
182 Index usage conventions: I propose that we use different indices
183 for different objects:
185 i a residue of structure A
186 j a residue of structure B
187 k a residue of a neighbourhood of structure A
188 l a residue of a neighbourhood of structure B
189 m a residue pair of a neignourhood alignment
190 n a residue pair of a structure alignment
192 Some examples, extensions, and additional conventions:
194 I # of residues in structure A
195 K_i # of residues of the neighbourhood of residue i of structure A
196 A_k(i) # the kth residue of neighbourhood i.
197 M_i,j # of residue pairs of an alignment of the neighbourhoods of
200 The pseudosubscript notation may not always be optimal in terms clarity.
201 We may occasionally use parenthesis, comma-separated susbscripts, etc.
202 instead, e.g. M(i,j) or M_ij.
204 The residues of the structures will be denoted:
209 This is for expressing general concepts only. It is of little practical
210 importance for the moment since we do not use all residue-related
211 structural information from pdb. Instead we use the C-alpha coordinates
213 C_1, C_2 ... C_I (for protein C)
214 D_1, D_2 ... D_J (for protein B)
216 for all compututations. The D_1 ... is admittedly not very intuitive
217 and I'm open for other suggestions. For the scalar components of the
218 C-alpha coordinates I propose that we use
220 C_1 = CX_1, CY_1, CZ_1 = (for example) 7.51, 1.24, 3.01
222 For the distance between two C-alpha atoms we write
228 [ (CX_1-CX_2)**2 + (CY_1-CY_2)**2 + (CZ_1-CZ_2)**2 ]**0.5
230 if I remember correctly from high school.
232 Back to the algorithm:
234 > Step 1: For each residue in each structure, extract
235 > the local structural neighbourhood. A neighbourhood
236 > is simply a subset of (usually non-consecutive)
237 > residues from the same structure.
239 The result is something like:
241 P(i) = P_1(i) .. P_k(i) .. P_K_i(i)
242 Q(i) = Q_1(j) .. Q_l(j) .. Q_L_j(j)
244 These are all ordered integer arrays. The P's and Q's indicate
245 residue positions in sequence space. For the C-alpha coordinates,
248 C(i) = C_1(i) .. C_k(i) .. C_K_i(i)
249 D(i) = D_1(j) .. D_l(j) .. D_L_i(j)
251 > Step 2: For all possible pairs of residues between structures
252 > A and B, compute the optimal neighbourhood alignment
253 > score. This score, which is also referred to as
254 > local neighbourhood similarity (NSL) score indicates
255 > whether two residues have similar local stuctural
258 We have to define a similarity score:
260 S(i,j) = function[A,B,P(i),Q(j)]
262 More specifically, S(i,j) is the score of an opimal alignment between
263 two subsets of C-alpha coordinates from A and B, defined by P(i) and Q(j).
264 We use the following notation for an alignment between two neighbourhoods.
266 R = (k_1,l_1) .. (k_m,l_m) .. (k_M, l_M)
268 This is pretty abstract and requires some explanation.
270 The alignment consists of M pairs of residues from two neighbourhoods.
271 The paired residues are numbered 1,2...K and 1,2...L, respectively.
272 Obviously M <= K,L. For K=9 and L=7, a possible alignment would
275 R = (1,2) , (2,3) , (5,4) , (6,5) , (9,7)
277 This alignment consists of five paired residues, the first
278 residue of neighbourhood P(i) is aligned with with the second residue
281 The score of an alignment Z(R) is a function that can be
282 defined in many different ways. But independently of its
285 S(i,j) = Z(R*,A,B,P(i),Q(j))
286 R* = argmax Z(R,A,B,P(i),Q(j))
288 This is just a complicated way of saying that the LNS score
289 S(i,j) is an optimal alignment score. A simple alignment
290 scoring function would be:
292 Z = Sum(m=1..M) [ 2 - |C_(k_m) - D_l_m)| ]
294 A more complex function could be the sum of the sums of "pair-weights",
295 "pair-connection-weights", and unpaired-residue-weights":
297 Z = Sum(m=1 .. M) [ PW (i,P_k_m,Q_l_m,C_k_m, D_l_m) ]
298 + Sum(m=2 .. M ) [ PCW(j,P_k_m,P_l_m,Q_k_m-1,Q_l_m-1,C_k_m,D_l_m,C_k_m-1,D_j_m-1 ]
299 + Sum(over k for all C_i(k) unpaired) UPRW [P_k, C_k ]
300 + Sum(over l for all C_i(l) unpaired) UPRW [Q_l, D_l)) ]
302 Here, the terms P_k_m ... denote sequence positions, the terms C_k_m ...
303 denote coordinates. i and j, the sequence position of the center residues
304 of the neighbourhoods under consideration) are included in the argument
305 lists of the functions because they are necessary to decide whether
306 a residue A_k_m occurs before or after the residue A_i in sequence space.
307 We don not want to align a residue A_k_m that occurs before A_i with
308 a residue B_j_l that occurs after B_j and vice-versa.
310 The LNS score could also be defined by a recursive equation system
311 defining a dynamic programming algorithm. However, I find the
312 above formulation more helpful for designing appropriate alignment
315 > Step 3: Generate one (or multiple) optimal structural alignment(s)r
316 > for A and B based on NLS scores plus some gap penalty
319 This is now pretty simple. We use essentially the same notation as
320 for the neighbourhood alignments.
322 R = (i_1,j_1) .. (i_n,j_n) .. (i_N, j_N)
327 The alignment scoring functing X is the sum of the LNS scores
328 of the pairs minus some gap penalties.
331 5-Current Shortcomings
333 At present, it is impossible to use the extra_param flag with T-Coffee. This means that
334 the actual values of your parameters must be HARD-CODED within the custom_pair_score_function
337 On request, I will implement a solution to solve that problem.
340 For any enquiry, please send a mail to:
341 cedric.notredame@europe.com
351 int custom_pair_score_function1 (Constraint_list *CL, int s1, int r1, int s2, int r2)
357 fp=vfopen ( "test_file", "w");
358 for ( a=0; a< ((CL->T[0])->pdb_param)->n_extra_param; a++)
359 fprintf (fp, "\n\t%s", ((CL->T[0])->pdb_param)->extra_param[a]);
361 fprintf ( fp, "\nTEST OK");
368 int custom_pair_score_function2 (Constraint_list *CL, int s1, int r1, int s2, int r2)
375 int custom_pair_score_function3 (Constraint_list *CL, int s1, int r1, int s2, int r2)
382 int custom_pair_score_function4 (Constraint_list *CL, int s1, int r1, int s2, int r2)
389 int custom_pair_score_function5 (Constraint_list *CL, int s1, int r1, int s2, int r2)
396 int custom_pair_score_function6 (Constraint_list *CL, int s1, int r1, int s2, int r2)
403 int custom_pair_score_function7 (Constraint_list *CL, int s1, int r1, int s2, int r2)
410 int custom_pair_score_function8 (Constraint_list *CL, int s1, int r1, int s2, int r2)
418 int custom_pair_score_function9 (Constraint_list *CL, int s1, int r1, int s2, int r2)
425 int custom_pair_score_function10 (Constraint_list *CL, int s1, int r1, int s2, int r2)
432 /*********************************COPYRIGHT NOTICE**********************************/
433 /*© Centro de Regulacio Genomica */
435 /*Cedric Notredame */
436 /*Tue Oct 27 10:12:26 WEST 2009. */
437 /*All rights reserved.*/
438 /*This file is part of T-COFFEE.*/
440 /* T-COFFEE is free software; you can redistribute it and/or modify*/
441 /* it under the terms of the GNU General Public License as published by*/
442 /* the Free Software Foundation; either version 2 of the License, or*/
443 /* (at your option) any later version.*/
445 /* T-COFFEE is distributed in the hope that it will be useful,*/
446 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
447 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
448 /* GNU General Public License for more details.*/
450 /* You should have received a copy of the GNU General Public License*/
451 /* along with Foobar; if not, write to the Free Software*/
452 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
453 /*............................................... |*/
454 /* If you need some more information*/
455 /* cedric.notredame@europe.com*/
456 /*............................................... |*/
460 /*********************************COPYRIGHT NOTICE**********************************/