binaries/src/tcoffee/t_coffee_source/CUSTOM_evaluate_for_struc.c

   1 #include <stdio.h>
   2 #include <stdlib.h>
   3 #include <math.h>
   4 #include <stdarg.h>
   5
   6
   7 #include "io_lib_header.h"
   8 #include "util_lib_header.h"
   9 #include "define_header.h"
  10
  11 #include "dp_lib_header.h"
  12
  13
  14 /*
  15   23/06/00, Cedric Notredame
  16
  17
  18   1-Content of the data structures.
  19   2-Implementing your own function in pdb_align.
  20   3-Using that function with T-Coffee (multiple Sequence Alignment).
  21   4-Syntax rules as defined by Philipp Bucher (19/06/00).
  22   5-Current Shortcomings
  23   6-Enquiries.
  24
  25   1-Content of the data structures
  26
  27    This file only contains a dummy function to help you create
  28    your own matching potential function (Step 2 in the Notations RULES
  29
  30    int evaluate_match_score ( Constraint_list *CL, int A, int i, int B, int j)
  31
  32    returns a score, expected to be between -100 and 100, that corresponds to the matching of
  33    A_i with B_j.
  34
  35    Most needed parameters are included in the data structure CL,
  36    This Data Structure is declared in util_constraint_list.h
  37    The following, non exhaustive list explains the most common parameters
  38
  39    The neighborhood is computed using:
  40    ((CL->T[A])->pdb_param)->maximum_distance as a radius for the Bubble
  41    ((CL->T[A])->pdb_param)->n_excluded_nb are excluded around the central residue
  42                            i.e i-1 and i+1 for n_excluded_nb=1.
  43
  44
  45    ((CL->T[A])->Bubble)->nb[i][0]     --> Number of residues in the bubble around A_i
  46    ((CL->T[A])->Bubble)->nb[i][k]=j   --> Index of the kth residue in the bubble
  47                                          Residues are sorted according to the Ca chain
  48    ((CL->T[A])->Bubble)->d_nb[i][k]=d --> Distance between A_i and A_j equals d;
  49
  50    ((CL->T[A])->ca[i]->x -----------> Coordinates of the Ca A_i
  51    ((CL->T[A])->ca[i]->y
  52    ((CL->T[A])->ca[i]->z
  53
  54
  55
  56    ((CL->T[A])->len      -----------> Length of Chain A.
  57    ((CL->T[A])->n_atom   -----------> n atoms in A.
  58
  59
  60    Unspecified parameters can be passed from the command line:
  61
  62            align_pdb -extra_parameters=10, 10.3, 11, 12.4, my_file
  63
  64    The values of these parameters can be accessed in:
  65
  66     ((CL->T[A])->pdb_param)->n_extra_param=4
  67     ((CL->T[A])->pdb_param)->extra_param[0]="10"
  68     ((CL->T[A])->pdb_param)->extra_param[1]="10.3"
  69     ((CL->T[A])->pdb_param)->extra_param[2]="11.6"
  70     ((CL->T[A])->pdb_param)->extra_param[3]="my_file"
  71
  72    These parameters contain strings! To get the real values, in C, use atoi and atof:
  73     atoi ( ((CL->T[A])->pdb_param)->extra_param[0])=10;
  74     atof ( ((CL->T[A])->pdb_param)->extra_param[1])=10.3;
  75
  76     The maximum number of parameters is currently 1000...
  77
  78
  79
  80   2-Implementing your own function
  81
  82     all you need to do is to edit this file and recompile align_pdb.
  83     There is no need to prototype any function.
  84
  85     10 functions holders exist, that correspond to the 10 dummy functions
  86     declared in this file:
  87                   custom_pair_score_function1
  88                   custom_pair_score_function2
  89                   custom_pair_score_function3
  90                   custom_pair_score_function4
  91                   .....
  92                   custom_pair_score_function10
  93
  94     Let us imagine, you want to use custom_pair_function1.
  95
  96              1-In CUSTOM_evaluate_for_struc.c, modify custom_pair_function1,
  97                so that it computes the score you need.
  98
  99              2-If you need extra parameters, get them from ((CL->T[A])->pdb_param)->extra_param.
 100              3-Recompile pdb_align:
 101                        -put it in your bin
 102                        -rehash or whatever
 103
 104              4-run the program as follows:
 105
 106              align_pdb -in <struc1> <struc2> -hasch_mode=custom_pair_score_function1
 107                        -extra_param=10, 12, 0.4, matrix...
 108
 109              5-My advice for a first time: make a very simple dummy function that spits
 110              out the content of extra_param.
 111
 112              6-Remember it is your responsability to control the number of extra parameters
 113              and their proper order, and type. Do not forget to apply atoi and atof to the parameters
 114
 115              7-Remember that the modifications you made to CUSTOM_evaluate_for_sytructure
 116              must be preserved by you!!! They may disappear if you update align_pdb, save them
 117              appart as your own patch.
 118
 119
 120
 121
 122 3-Using that function with T-Coffee (multiple Sequence Alignment).
 123
 124   1- setenv ALIGN_PDB_4_TCOFFEE  <your version of align_pdb>
 125
 126   2- run t_coffee
 127   To do so, you will NOT NEED to recompile T-Coffee, simply type:
 128             t_coffee -in <struc1> <struc2> ... custom1_align_pdb_pair
 129
 130
 131
 132 4-Syntax rules as defined by Philipp Bucher (19/06/00).
 133
 134   Proposed ascii text notation for align_pdb
 135
 136      First, let us summarize the align pdb algorithm in plain
 137      english:
 138
 139      Given are two protein structures A and B.
 140
 141         Step 1: For each residue in each structure extract
 142            the local structural neighbourhood. A neighbourhood
 143            is simply a subset of (usually non-consecutive)
 144            residues from one of the structures.
 145
 146         Step 2: For all possible pairs of residues between structures
 147            A and B, compute the optimal neighbourhood alignment
 148            score. This score, which is also referred to as
 149            local neighbourhood similarity (LNS) score indicates
 150            whether two residues have similar local stuctural
 151            environemnts.
 152
 153         Step 3: Generate one (or multiple) optimal structural alignment(s)
 154            for A and B based on LNS scores plus some gap penalty
 155            function.
 156
 157      Now, some rules for ascii/email notation:
 158
 159       - Whenever possible use a style which fits on one line (because it
 160         is painful to modify formulas that span over several lines). Example:
 161
 162         Use: ( a**2 + b**2 )**0.5
 163                       ________
 164                      |  2    2
 165         instead of: \| a  + b
 166
 167         Introduce local variables/functions to split long expressions over
 168         several lines, e.g.
 169
 170         Score = Sum(0<i<I+1) Match(A_i,B_i) where
 171
 172                 Match(A,B) = ..
 173
 174       - Pseudosubscript notation for (multiply) indexed variables:
 175
 176         A_i A_j_i
 177
 178         As a general rule, I propose that we always use lower case
 179         letters for indices, and that the corresponding upper case letters
 180         denote the number of indexed objects.
 181
 182      Index usage conventions: I propose that we use different indices
 183      for different objects:
 184
 185         i  a residue of structure A
 186         j  a residue of structure B
 187         k  a residue of a neighbourhood of structure A
 188         l  a residue of a neighbourhood of structure B
 189         m  a residue pair of a neignourhood alignment
 190         n  a residue pair of a structure alignment
 191
 192      Some examples, extensions, and additional conventions:
 193
 194         I      # of residues in structure A
 195         K_i    # of residues of the neighbourhood of residue i of  structure A
 196         A_k(i) # the kth residue of neighbourhood i.
 197         M_i,j  # of residue pairs of an alignment of the neighbourhoods of
 198                residues i and j.
 199
 200      The pseudosubscript notation may not always be optimal in terms clarity.
 201      We may occasionally use parenthesis, comma-separated susbscripts, etc.
 202      instead, e.g. M(i,j) or M_ij.
 203
 204      The residues of the structures will be denoted:
 205
 206         A_1, A_2 ... A_I
 207         B_1, B_2 ... B_J
 208
 209      This is for expressing general concepts only. It is of little practical
 210      importance for the moment since we do not use all residue-related
 211      structural information from pdb. Instead we use the C-alpha coordinates
 212
 213         C_1, C_2 ... C_I  (for protein C)
 214         D_1, D_2 ... D_J  (for protein B)
 215
 216      for all compututations. The D_1 ...  is admittedly not very intuitive
 217      and I'm open for other suggestions. For the scalar components of the
 218      C-alpha coordinates I propose that we use
 219
 220         C_1 = CX_1, CY_1, CZ_1 = (for example) 7.51, 1.24, 3.01
 221
 222      For the distance between two C-alpha atoms we write
 223
 224         |C_1-C_2|
 225
 226      which equals
 227
 228         [ (CX_1-CX_2)**2 + (CY_1-CY_2)**2 + (CZ_1-CZ_2)**2 ]**0.5
 229
 230      if I remember correctly from high school.
 231
 232      Back to the algorithm:
 233
 234      > Step 1: For each residue in each structure, extract
 235      >         the local structural neighbourhood. A neighbourhood
 236      >         is simply a subset of (usually non-consecutive)
 237      >         residues from the same structure.
 238
 239      The result is something like:
 240
 241        P(i) = P_1(i) .. P_k(i) .. P_K_i(i)
 242        Q(i) = Q_1(j) .. Q_l(j) .. Q_L_j(j)
 243
 244      These are all ordered integer arrays. The P's and Q's indicate
 245      residue positions in sequence space. For the C-alpha coordinates,
 246      we use:
 247
 248        C(i) = C_1(i) .. C_k(i) .. C_K_i(i)
 249        D(i) = D_1(j) .. D_l(j) .. D_L_i(j)
 250
 251      > Step 2: For all possible pairs of residues between structures
 252      >         A and B, compute the optimal neighbourhood alignment
 253      >         score. This score, which is also referred to as
 254      >         local neighbourhood similarity (NSL) score indicates
 255      >         whether two residues have similar local stuctural
 256      >         environemnts.
 257
 258      We have to define a similarity score:
 259
 260        S(i,j) = function[A,B,P(i),Q(j)]
 261
 262      More specifically, S(i,j) is the score of an opimal alignment between
 263      two subsets of C-alpha coordinates from A and B, defined by P(i) and Q(j).
 264      We use the following notation for an alignment between two neighbourhoods.
 265
 266        R = (k_1,l_1) .. (k_m,l_m) .. (k_M, l_M)
 267
 268      This is pretty abstract and requires some explanation.
 269
 270      The alignment consists of M pairs of residues from two neighbourhoods.
 271      The paired residues are numbered 1,2...K and 1,2...L, respectively.
 272      Obviously M <= K,L. For K=9 and L=7, a possible alignment would
 273      look as follows:
 274
 275        R = (1,2) , (2,3) , (5,4) , (6,5) , (9,7)
 276
 277      This alignment consists of five paired residues, the first
 278      residue of neighbourhood P(i) is aligned with with the second residue
 279      of Q(j), etc.
 280
 281      The score of an alignment Z(R) is a function that can be
 282      defined in many different ways. But independently of its
 283      definition:
 284
 285         S(i,j) = Z(R*,A,B,P(i),Q(j))
 286         R* = argmax Z(R,A,B,P(i),Q(j))
 287
 288      This is just a complicated way of saying that the LNS score
 289      S(i,j) is an optimal alignment score. A simple alignment
 290      scoring function would be:
 291
 292         Z = Sum(m=1..M) [ 2 - |C_(k_m) - D_l_m)| ]
 293
 294      A more complex function could be the sum of the sums of "pair-weights",
 295      "pair-connection-weights", and unpaired-residue-weights":
 296
 297         Z =   Sum(m=1 .. M)  [ PW (i,P_k_m,Q_l_m,C_k_m, D_l_m) ]
 298             + Sum(m=2 .. M ) [ PCW(j,P_k_m,P_l_m,Q_k_m-1,Q_l_m-1,C_k_m,D_l_m,C_k_m-1,D_j_m-1 ]
 299             + Sum(over k for all C_i(k) unpaired) UPRW [P_k, C_k ]
 300             + Sum(over l for all C_i(l) unpaired) UPRW [Q_l, D_l)) ]
 301
 302      Here, the terms P_k_m ... denote sequence positions, the terms C_k_m ...
 303      denote coordinates. i and j, the sequence position of the center residues
 304      of the neighbourhoods under consideration) are included in the argument
 305      lists of the functions because they are necessary to decide whether
 306      a residue A_k_m occurs before or after the residue A_i in sequence space.
 307      We don not want to align a residue A_k_m that occurs before A_i with
 308      a residue B_j_l that occurs after B_j and vice-versa.
 309
 310      The LNS score could also be defined by a recursive equation system
 311      defining a dynamic programming algorithm. However, I find the
 312      above formulation more helpful for designing appropriate alignment
 313      scoring functions.
 314
 315      >       Step 3: Generate one (or multiple) optimal structural alignment(s)r
 316      >          for A and B based on NLS scores plus some gap penalty
 317      >          function.
 318
 319      This is now pretty simple. We use essentially the same notation as
 320      for the neighbourhood alignments.
 321
 322        R = (i_1,j_1) .. (i_n,j_n) .. (i_N, j_N)
 323
 324        X* = X(R*,A,B)
 325        R* = argmax X(R,A,B)
 326
 327      The alignment scoring functing X is the sum of the LNS scores
 328      of the pairs minus some gap penalties.
 329
 330
 331 5-Current Shortcomings
 332
 333    At present, it is impossible to use the extra_param flag with T-Coffee. This means that
 334    the actual values of your parameters must be HARD-CODED within the custom_pair_score_function
 335    you are using.
 336
 337    On request, I will implement a solution to solve that problem.
 338
 339 6-Contact
 340   For any enquiry, please send a mail to:
 341      cedric.notredame@europe.com
 342  */
 343
 344
 345
 346
 347
 348
 349
 350
 351 int custom_pair_score_function1 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 352    {
 353      int score=0;
 354      int a;
 355      FILE *fp;
 356
 357      fp=vfopen ( "test_file", "w");
 358      for ( a=0; a< ((CL->T[0])->pdb_param)->n_extra_param; a++)
 359        fprintf (fp, "\n\t%s", ((CL->T[0])->pdb_param)->extra_param[a]);
 360
 361      fprintf ( fp, "\nTEST OK");
 362      vfclose ( fp);
 363      exit (1);
 364
 365      return score;
 366
 367    }
 368 int custom_pair_score_function2 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 369    {
 370      int score=0;
 371
 372      return score;
 373
 374    }
 375 int custom_pair_score_function3 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 376    {
 377      int score=0;
 378
 379      return score;
 380
 381    }
 382 int custom_pair_score_function4 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 383    {
 384      int score=0;
 385
 386      return score;
 387
 388    }
 389 int custom_pair_score_function5 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 390    {
 391      int score=0;
 392
 393      return score;
 394
 395    }
 396 int custom_pair_score_function6 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 397    {
 398      int score=0;
 399
 400      return score;
 401
 402    }
 403 int custom_pair_score_function7 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 404    {
 405      int score=0;
 406
 407      return score;
 408
 409    }
 410 int custom_pair_score_function8 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 411    {
 412      int score=0;
 413
 414      return score;
 415
 416    }
 417
 418 int custom_pair_score_function9 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 419    {
 420      int score=0;
 421
 422      return score;
 423
 424    }
 425 int custom_pair_score_function10 (Constraint_list *CL, int s1, int r1, int s2, int r2)
 426    {
 427      int score=0;
 428
 429      return score;
 430
 431    }
 432 /******************************COPYRIGHT NOTICE*******************************/
 433 /*© Centro de Regulacio Genomica */
 434 /*and */
 435 /*Cedric Notredame */
 436 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
 437 /*All rights reserved.*/
 438 /*This file is part of T-COFFEE.*/
 439 /**/
 440 /*    T-COFFEE is free software; you can redistribute it and/or modify*/
 441 /*    it under the terms of the GNU General Public License as published by*/
 442 /*    the Free Software Foundation; either version 2 of the License, or*/
 443 /*    (at your option) any later version.*/
 444 /**/
 445 /*    T-COFFEE is distributed in the hope that it will be useful,*/
 446 /*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
 447 /*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
 448 /*    GNU General Public License for more details.*/
 449 /**/
 450 /*    You should have received a copy of the GNU General Public License*/
 451 /*    along with Foobar; if not, write to the Free Software*/
 452 /*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
 453 /*...............................................                                                                                      |*/
 454 /*  If you need some more information*/
 455 /*  cedric.notredame@europe.com*/
 456 /*...............................................                                                                                                                                     |*/
 457 /**/
 458 /**/
 459 /*      */
 460 /******************************COPYRIGHT NOTICE*******************************/