src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis.scoremodels;
  22
  23 import jalview.api.AlignmentViewPanel;
  24 import jalview.api.FeatureRenderer;
  25 import jalview.api.analysis.ScoreModelI;
  26 import jalview.api.analysis.SimilarityParamsI;
  27 import jalview.datamodel.AlignmentAnnotation;
  28 import jalview.datamodel.AlignmentView;
  29 import jalview.datamodel.Annotation;
  30 import jalview.datamodel.SeqCigar;
  31 import jalview.math.Matrix;
  32 import jalview.math.MatrixI;
  33 import jalview.util.SetUtils;
  34
  35 import java.util.HashMap;
  36 import java.util.HashSet;
  37 import java.util.Map;
  38 import java.util.Set;
  39
  40 /* This class contains methods to calculate distance score between
  41  * secondary structure annotations of the sequences. The inverse of
  42  * the score is later calculated for similarity score.
  43  */
  44 public class SecondaryStructureDistanceModel extends DistanceScoreModel
  45 {
  46   private static final String NAME = "Secondary Structure Similarity";
  47
  48   //label in secondary structure annotation data model from 3d structures
  49   private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
  50
  51   //label in secondary structure annotation data model from JPred
  52   private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred";
  53
  54   //label in secondary structure annotation data model from JPred
  55   private static final String JPRED_WEBSERVICE = "JPred";
  56
  57   private String description;
  58
  59   //maximum distance score is defined as 2 as the possible number of unique secondary structure is 2.
  60   private static final int MAX_SCORE = 2;
  61
  62   //minimum distance score is defined as 2 as the possible number of unique secondary structure is 2.
  63   private static final int MIN_SCORE = 0;
  64
  65   //character used to represent coil in secondary structure
  66   private static final char COIL = 'C';
  67
  68   FeatureRenderer fr;
  69
  70   /*
  71    * Annotation label for available sources of secondary structure
  72    */
  73   private static final String[] SS_ANNOTATION_LABELS = {
  74       SS_ANNOTATION_LABEL,
  75       SS_ANNOTATION_FROM_JPRED_LABEL
  76   };
  77
  78   /**
  79    * Constructor
  80    */
  81   public SecondaryStructureDistanceModel()
  82   {
  83
  84   }
  85
  86   @Override
  87   public ScoreModelI getInstance(AlignmentViewPanel view)
  88   {
  89     SecondaryStructureDistanceModel instance;
  90     try
  91     {
  92       instance = this.getClass().getDeclaredConstructor().newInstance();
  93       instance.configureFromAlignmentView(view);
  94       return instance;
  95     } catch (InstantiationException | IllegalAccessException e)
  96     {
  97       jalview.bin.Console.errPrintln("Error in " + getClass().getName()
  98               + ".getInstance(): " + e.getMessage());
  99       return null;
 100     } catch (ReflectiveOperationException roe)
 101     {
 102       return null;
 103     }
 104   }
 105
 106   boolean configureFromAlignmentView(AlignmentViewPanel view)
 107
 108   {
 109     fr = view.cloneFeatureRenderer();
 110     return true;
 111   }
 112
 113   /**
 114    * Calculates a distance measure [i][j] between each pair of sequences as the
 115    * average number of features they have but do not share. That is, find the
 116    * features each sequence pair has at each column, ignore feature types they
 117    * have in common, and count the rest. The totals are normalised by the number
 118    * of columns processed.
 119    * <p>
 120    * The parameters argument provides settings for treatment of gap-residue
 121    * aligned positions, and whether the score is over the longer or shorter of
 122    * each pair of sequences
 123    *
 124    * @param seqData
 125    * @param params
 126    */
 127
 128   /**
 129    * Calculates distance score [i][j] between each pair of protein sequences
 130    * based on their secondary structure annotations (H, E, C). That is, find the
 131    * secondary structures each sequence has at each column and scores positively for
 132    * each non similar secondary structure annotations. Scores 0 for similar secondary
 133    * structure annotations. The final score is normalized by the number of
 134    * alignment columns processed, providing an average similarity score.
 135    * <p>
 136    * The parameters argument can include settings for handling gap-residue aligned
 137    * positions and may determine if the score calculation is based on the longer or shorter
 138    * sequence in each pair. This can be important for handling partial alignments or
 139    * sequences of significantly different lengths.
 140    *
 141    * @param seqData  The aligned sequence data including secondary structure annotations.
 142    * @param params   Additional parameters for customizing the scoring process, such as gap
 143    *                 handling and sequence length consideration.
 144    */
 145   @Override
 146   public MatrixI findDistances(AlignmentView seqData,
 147           SimilarityParamsI params)
 148   {
 149
 150     SeqCigar[] seqs = seqData.getSequences();
 151     int noseqs = seqs.length; //no of sequences
 152     int cpwidth = 0; // = seqData.getWidth();
 153     double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
 154     double[][] substitutionMatrix = getSubstitutionMatrix();
 155     //secondary structure source parameter selected by the user from the drop down.
 156     String ssSource = params.getSecondaryStructureSource();
 157
 158     //defining the default value for secondary structure source as 3d structures
 159     //or JPred if user selected JPred
 160     String selectedSSSource = SS_ANNOTATION_LABEL;
 161     if(ssSource.equals(JPRED_WEBSERVICE))
 162       selectedSSSource = SS_ANNOTATION_FROM_JPRED_LABEL;
 163
 164     // need to get real position for view position
 165     int[] viscont = seqData.getVisibleContigs();
 166
 167     /*
 168      * Add secondary structure annotations that are added to the annotation track
 169      * to the map
 170      */
 171     Map<String, HashSet<String>> ssAlignmentAnnotationForSequences
 172       = new HashMap<String,HashSet<String>>();
 173
 174     AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment()
 175             .getAlignmentAnnotation();
 176
 177     if(alignAnnotList.length > 0) {
 178       for (AlignmentAnnotation aa: alignAnnotList) {
 179         if (selectedSSSource.equals(aa.label)) {
 180           ssAlignmentAnnotationForSequences.computeIfAbsent(
 181                   aa.sequenceRef.getName(), k -> new HashSet<>())
 182           .add(aa.description);
 183         }
 184       }
 185     }
 186
 187     /*
 188      * Get the set of sequences which are not considered for the calculation.
 189      * Following sequences are added:
 190      * 1. Sequences without a defined secondary structure from the selected
 191      * source.
 192      * 2. Sequences whose secondary structure annotations are not added to
 193      * the annotation track
 194      */
 195     Set<SeqCigar> seqsWithUndefinedSS
 196         = findSeqsWithUndefinedSS(seqs, ssAlignmentAnnotationForSequences);
 197
 198     /*
 199      * scan each column, compute and add to each distance[i, j]
 200      * the number of secondary structure annotation that seqi
 201      * and seqj do not share
 202      */
 203     for (int vc = 0; vc < viscont.length; vc += 2)
 204     {
 205       //Iterates for each column position
 206       for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
 207       {
 208         cpwidth++; //used to normalise the distance score
 209
 210         /*
 211          * get set of sequences without gap in the current column
 212          */
 213         Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
 214
 215         /*
 216          * count score for each dissimilar secondary structure annotation on i'th and j'th
 217          * sequence. Igonre if similar and add this 'distance' measure to the total
 218          * for [i, j] for j > i
 219          */
 220         for (int i = 0; i < (noseqs - 1); i++)
 221         {
 222           //Iterates for each sequences
 223           for (int j = i + 1; j < noseqs; j++)
 224           {
 225             SeqCigar sc1 = seqs[i];
 226             SeqCigar sc2 = seqs[j];
 227
 228
 229             //check if ss is defined
 230             boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
 231             boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
 232
 233             // Set distance to 0 if both SS are not defined
 234             if (undefinedSS1 && undefinedSS2) {
 235                 distances[i][j] += MIN_SCORE;
 236                 continue;
 237             }
 238
 239             // Set distance to maximum score if either one SS is not defined
 240             else if(undefinedSS1 || undefinedSS2) {
 241                 distances[i][j] += MAX_SCORE;
 242                 continue;
 243             }
 244
 245             //check if the sequence contains gap in the current column
 246             boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
 247             boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
 248
 249             //Variable to store secondary structure at the current column
 250             char ss1 = 'G', ss2 = 'G';
 251
 252             //secondary structure is fetched only if the current column is not
 253             //gap for the sequence
 254             if(!gap1 && !undefinedSS1) {
 255               ss1 =
 256                   findSSAnnotationForGivenSeqAndCol(seqs[i], cpos);
 257             }
 258
 259             if(!gap2 && !undefinedSS2) {
 260               ss2 =
 261                   findSSAnnotationForGivenSeqAndCol(seqs[j], cpos);
 262             }
 263
 264             /*
 265              * gap-gap scores zero
 266              * similar ss-ss scores zero
 267              * different ss-ss scores 1
 268              * gap-ss scores 1 if params say to do so
 269              */
 270             if ((!gap1 && !gap2) || params.includeGaps())
 271             {
 272               // Calculate distance score based on the substitution matrix
 273               double seqDistance = substitutionMatrix[getSubstitutionMatrixIndex(ss1)][getSubstitutionMatrixIndex(ss2)];
 274               distances[i][j] += seqDistance;
 275             }
 276           }
 277         }
 278       }
 279     }
 280
 281     /*
 282      * normalise the distance scores (summed over columns) by the
 283      * number of visible columns used in the calculation
 284      * and fill in the bottom half of the matrix
 285      */
 286     // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
 287     for (int i = 0; i < noseqs; i++)
 288     {
 289       for (int j = i + 1; j < noseqs; j++)
 290       {
 291         distances[i][j] /= cpwidth;
 292         distances[j][i] = distances[i][j];
 293       }
 294     }
 295     return new Matrix(distances);
 296   }
 297
 298   /**
 299    * Builds and returns a set containing sequences (SeqCigar) which do not
 300    * have a gap at the given column position.
 301    *
 302    * @param seqs
 303    * @param columnPosition
 304    *          (0..)
 305    * @return
 306    */
 307   protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
 308           SeqCigar[] seqs, int columnPosition)
 309   {
 310     Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
 311     for (SeqCigar seq : seqs)
 312     {
 313       int spos = seq.findPosition(columnPosition);
 314       if (spos != -1)
 315       {
 316         /*
 317          * position is not a gap
 318          */
 319         seqsWithoutGapAtCol.add(seq);
 320       }
 321     }
 322     return seqsWithoutGapAtCol;
 323   }
 324
 325
 326   /**
 327    * Builds and returns a set containing sequences (SeqCigar) which
 328    * are not considered for the similarity calculation.
 329    * Following sequences are added:
 330    * 1. Sequences without a defined secondary structure from the selected
 331    * source.
 332    * 2. Sequences whose secondary structure annotations are not added to
 333    * the annotation track
 334    * @param seqs
 335    * @param ssAlignmentAnnotationForSequences
 336    * @return
 337    */
 338   protected Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs,
 339           Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
 340       Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
 341       for (SeqCigar seq : seqs) {
 342           if (isSSUndefinedOrNotAdded(seq, ssAlignmentAnnotationForSequences)) {
 343               seqsWithUndefinedSS.add(seq);
 344           }
 345       }
 346       return seqsWithUndefinedSS;
 347   }
 348
 349
 350   /**
 351    * Returns true if a sequence (SeqCigar) should not be
 352    * considered for the similarity calculation.
 353    * Following conditions are checked:
 354    * 1. Sequence without a defined secondary structure from the selected
 355    * source.
 356    * 2. Sequences whose secondary structure annotations are not added to
 357    * the annotation track
 358    * @param seq
 359    * @param ssAlignmentAnnotationForSequences
 360    * @return
 361    */
 362   private boolean isSSUndefinedOrNotAdded(SeqCigar seq,
 363           Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
 364       for (String label : SS_ANNOTATION_LABELS) {
 365           AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label);
 366           if (annotations != null) {
 367               for (AlignmentAnnotation annotation : annotations) {
 368                 HashSet<String> descriptionSet = ssAlignmentAnnotationForSequences
 369                         .get(annotation.sequenceRef.getName());
 370                 if (descriptionSet != null)
 371                   if (descriptionSet.contains(annotation.description)) {
 372                       // Secondary structure annotation is present and
 373                       //added to the track, no need to add seq
 374                       return false;
 375                   }
 376               }
 377           }
 378       }
 379       // Either annotations are undefined or not added to the track
 380       return true;
 381   }
 382
 383
 384   /**
 385    * Finds secondary structure annotation for a given sequence (SeqCigar)
 386    * and column position corresponding to the sequence.
 387    *
 388    * @param seq
 389    * @param columnPosition
 390    *          (0..)
 391    * @return
 392    */
 393   private char findSSAnnotationForGivenSeqAndCol(
 394       SeqCigar seq, int columnPosition)
 395   {
 396     char ss = 'G';
 397
 398     //fetch the position in sequence for the column and finds the
 399     //corresponding secondary structure annotation
 400     //TO DO - consider based on priority
 401     int seqPosition = seq.findPosition(columnPosition);
 402     AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
 403
 404     if(aa == null) {
 405       aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL);
 406     }
 407
 408     if (aa != null) {
 409       if (aa[0].getAnnotationForPosition(seqPosition) != null) {
 410         Annotation a = aa[0].getAnnotationForPosition(seqPosition);
 411         ss = a.secondaryStructure;
 412
 413         //There is no representation for coil and it can be either ' ' or null.
 414         if (ss == ' ' || ss == '-') {
 415           ss = COIL;
 416         }
 417       }
 418       else {
 419         ss = COIL;
 420       }
 421
 422     }
 423
 424     return ss;
 425   }
 426
 427   /**
 428    * Retrieve the substitution matrix.
 429    *
 430    * @return The substitution matrix.
 431    */
 432   private double[][] getSubstitutionMatrix() {
 433       // Defining the substitution matrix
 434       // This matrix map distance scores between secondary structure symbols
 435
 436       return new double[][]{
 437               // C   E   H  G
 438               {0.0, 1.0, 1.0, 1.0}, // C - COIL
 439               {1.0, 0.0, 1.0, 1.0}, // E - SHEET
 440               {1.0, 1.0, 0.0, 1.0}, // H - HELIX
 441               {1.0, 1.0, 1.0, 0.0} // G - GAP
 442
 443       };
 444   }
 445
 446   private int getSubstitutionMatrixIndex(char ss) {
 447     switch (ss) {
 448         case 'C':
 449             return 0;
 450         case 'E':
 451             return 1;
 452         case 'H':
 453             return 2;
 454         case 'G':
 455           return 3;
 456         default:
 457             throw new IllegalArgumentException("Invalid secondary structure character: " + ss);
 458     }
 459   }
 460
 461   @Override
 462   public String getName()
 463   {
 464     return NAME;
 465   }
 466
 467   @Override
 468   public String getDescription()
 469   {
 470     return description;
 471   }
 472
 473   @Override
 474   public boolean isDNA()
 475   {
 476     return false;
 477   }
 478
 479   @Override
 480   public boolean isProtein()
 481   {
 482     return false;
 483   }
 484
 485   @Override
 486   public boolean isSecondaryStructure()
 487   {
 488     return true;
 489   }
 490
 491   @Override
 492   public String toString()
 493   {
 494     return "Score between sequences based on hamming distance between binary "
 495             + "vectors marking secondary structure displayed at each column";
 496   }
 497 }