src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis.scoremodels;
  22
  23 import jalview.api.AlignmentViewPanel;
  24 import jalview.api.FeatureRenderer;
  25 import jalview.api.analysis.ScoreModelI;
  26 import jalview.api.analysis.SimilarityParamsI;
  27 import jalview.datamodel.AlignmentAnnotation;
  28 import jalview.datamodel.AlignmentView;
  29 import jalview.datamodel.Annotation;
  30 import jalview.datamodel.SeqCigar;
  31 import jalview.math.Matrix;
  32 import jalview.math.MatrixI;
  33 import jalview.util.SetUtils;
  34
  35 import java.util.HashSet;
  36 import java.util.Set;
  37
  38 /* This class contains methods to calculate distance score between
  39  * secondary structure annotations of the sequences. The inverse of
  40  * the score is later calculated for similarity score.
  41  */
  42 public class SecondaryStructureDistanceModel extends DistanceScoreModel
  43 {
  44   private static final String NAME = "Secondary Structure Similarity";
  45
  46   private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
  47
  48   private String description;
  49
  50   //maximum distance score is defined as 2 as the possible number of unique ss is 2.
  51   private static final int MAX_SCORE = 2;
  52
  53   //minimum distance score is defined as 2 as the possible number of unique ss is 2.
  54   private static final int MIN_SCORE = 0;
  55
  56   private static final char COIL = 'C';
  57
  58   FeatureRenderer fr;
  59
  60   /**
  61    * Constructor
  62    */
  63   public SecondaryStructureDistanceModel()
  64   {
  65
  66   }
  67
  68   @Override
  69   public ScoreModelI getInstance(AlignmentViewPanel view)
  70   {
  71     SecondaryStructureDistanceModel instance;
  72     try
  73     {
  74       instance = this.getClass().getDeclaredConstructor().newInstance();
  75       instance.configureFromAlignmentView(view);
  76       return instance;
  77     } catch (InstantiationException | IllegalAccessException e)
  78     {
  79       jalview.bin.Console.errPrintln("Error in " + getClass().getName()
  80               + ".getInstance(): " + e.getMessage());
  81       return null;
  82     } catch (ReflectiveOperationException roe)
  83     {
  84       return null;
  85     }
  86   }
  87
  88   boolean configureFromAlignmentView(AlignmentViewPanel view)
  89
  90   {
  91     fr = view.cloneFeatureRenderer();
  92     return true;
  93   }
  94
  95   /**
  96    * Calculates a distance measure [i][j] between each pair of sequences as the
  97    * average number of features they have but do not share. That is, find the
  98    * features each sequence pair has at each column, ignore feature types they
  99    * have in common, and count the rest. The totals are normalised by the number
 100    * of columns processed.
 101    * <p>
 102    * The parameters argument provides settings for treatment of gap-residue
 103    * aligned positions, and whether the score is over the longer or shorter of
 104    * each pair of sequences
 105    *
 106    * @param seqData
 107    * @param params
 108    */
 109
 110   /**
 111    * Calculates distance score [i][j] between each pair of protein sequences
 112    * based on their secondary structure annotations (H, E, C). That is, find the
 113    * secondary structures each sequence has at each column and scores positively for
 114    * each non similar secondary structure annotations. Scores 0 for similar secondary
 115    * structure annotations. The final score is normalized by the number of
 116    * alignment columns processed, providing an average similarity score.
 117    * <p>
 118    * The parameters argument can include settings for handling gap-residue aligned
 119    * positions and may determine if the score calculation is based on the longer or shorter
 120    * sequence in each pair. This can be important for handling partial alignments or
 121    * sequences of significantly different lengths.
 122    *
 123    * @param seqData  The aligned sequence data including secondary structure annotations.
 124    * @param params   Additional parameters for customizing the scoring process, such as gap
 125    *                 handling and sequence length consideration.
 126    */
 127   @Override
 128   public MatrixI findDistances(AlignmentView seqData,
 129           SimilarityParamsI params)
 130   {
 131     SeqCigar[] seqs = seqData.getSequences();
 132     int noseqs = seqs.length; //no of sequences
 133     int cpwidth = 0; // = seqData.getWidth();
 134     double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
 135
 136     // need to get real position for view position
 137     int[] viscont = seqData.getVisibleContigs();
 138
 139     Set<SeqCigar> seqsWithUndefinedSS = findSeqsWithUndefinedSS(seqs);
 140
 141
 142     /*
 143      * scan each column, compute and add to each distance[i, j]
 144      * the number of secondary structure annotation that seqi
 145      * and seqj do not share
 146      */
 147     for (int vc = 0; vc < viscont.length; vc += 2)
 148     {
 149       //Iterates for each column position
 150       for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
 151       {
 152         cpwidth++; //used to normalise the distance score
 153
 154         /*
 155          * get set of sequences without gap in the current column
 156          */
 157         Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
 158
 159         /*
 160          * count score for each dissimilar secondary structure annotation on i'th and j'th
 161          * sequence. Igonre if similar and add this 'distance' measure to the total
 162          * for [i, j] for j > i
 163          */
 164         for (int i = 0; i < (noseqs - 1); i++)
 165         {
 166           //Iterates for each sequences
 167           for (int j = i + 1; j < noseqs; j++)
 168           {
 169             SeqCigar sc1 = seqs[i];
 170             SeqCigar sc2 = seqs[j];
 171
 172             //check if ss is defined
 173             boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
 174             boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
 175
 176             // Set distance to 0 if both SS are not defined
 177             if (undefinedSS1 && undefinedSS2) {
 178                 distances[i][j] += MIN_SCORE;
 179                 continue;
 180             }
 181
 182             // Set distance to maximum score if either one SS is not defined
 183             else if(undefinedSS1 || undefinedSS2) {
 184                 distances[i][j] += MAX_SCORE;
 185                 continue;
 186             }
 187
 188             //check if the sequence contains gap in the current column
 189             boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
 190             boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
 191
 192             //Variable to store secondary structure at the current column
 193             Set<String> secondaryStructure1 = new HashSet<String>();
 194             Set<String> secondaryStructure2 = new HashSet<String>();
 195
 196             //secondary structure is fetched only if the current column is not
 197             //gap for the sequence
 198             if(!gap1 && !undefinedSS1) {
 199                 secondaryStructure1.addAll(
 200                                 findSSAnnotationForGivenSeqAndCol(seqs[i], cpos));
 201             }
 202
 203             if(!gap2 && !undefinedSS2) {
 204                 secondaryStructure2.addAll(
 205                                 findSSAnnotationForGivenSeqAndCol(seqs[j], cpos));
 206             }
 207
 208             /*
 209              * gap-gap always scores zero
 210              * residue-residue is always scored
 211              * include gap-residue score if params say to do so
 212              */
 213             if ((!gap1 && !gap2) || params.includeGaps())
 214             {
 215               int seqDistance = SetUtils.countDisjunction(
 216                           secondaryStructure1, secondaryStructure2);
 217               distances[i][j] += seqDistance;
 218             }
 219           }
 220         }
 221       }
 222     }
 223
 224     /*
 225      * normalise the distance scores (summed over columns) by the
 226      * number of visible columns used in the calculation
 227      * and fill in the bottom half of the matrix
 228      */
 229     // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
 230     for (int i = 0; i < noseqs; i++)
 231     {
 232       for (int j = i + 1; j < noseqs; j++)
 233       {
 234         distances[i][j] /= cpwidth;
 235         distances[j][i] = distances[i][j];
 236       }
 237     }
 238     return new Matrix(distances);
 239   }
 240
 241   /**
 242    * Builds and returns a set containing sequences (SeqCigar) which do not
 243    * have a gap at the given column position.
 244    *
 245    * @param seqs
 246    * @param columnPosition
 247    *          (0..)
 248    * @return
 249    */
 250   protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
 251           SeqCigar[] seqs, int columnPosition)
 252   {
 253     Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
 254     for (SeqCigar seq : seqs)
 255     {
 256       int spos = seq.findPosition(columnPosition);
 257       if (spos != -1)
 258       {
 259         /*
 260          * position is not a gap
 261          */
 262           seqsWithoutGapAtCol.add(seq);
 263       }
 264     }
 265     return seqsWithoutGapAtCol;
 266   }
 267
 268   /**
 269    * Builds and returns a set containing sequences (SeqCigar) which have
 270    * no secondary structures defined
 271    *
 272    * @param seqs
 273    *          (0..)
 274    * @return
 275    */
 276   protected Set<SeqCigar> findSeqsWithUndefinedSS(
 277           SeqCigar[] seqs)
 278   {
 279     Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
 280     for (SeqCigar seq : seqs)
 281     {
 282
 283       AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
 284       if (aa == null) {
 285          /*
 286          * secondary structure is undefined for the seq
 287          * Add seq in the set
 288          */
 289         seqsWithUndefinedSS.add(seq);
 290       }
 291     }
 292     return seqsWithUndefinedSS;
 293   }
 294
 295   /**
 296    * Finds secondary structure annotation for a given sequence (SeqCigar)
 297    * and column position corresponding to the sequence.
 298    *
 299    * @param seq
 300    * @param columnPosition
 301    *          (0..)
 302    * @return
 303    */
 304   private Set<String> findSSAnnotationForGivenSeqAndCol(
 305                   SeqCigar seq, int columnPosition)
 306   {
 307           Set<String> secondaryStructure = new HashSet<String>();
 308
 309           char ss;
 310
 311           //fetch the position in sequence for the column and finds the
 312           //corresponding secondary structure annotation
 313           int seqPosition = seq.findPosition(columnPosition);
 314           AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
 315           if (aa != null) {
 316             if (aa[0].getAnnotationForPosition(seqPosition) != null) {
 317               Annotation a = aa[0].getAnnotationForPosition(seqPosition);
 318               ss = a.secondaryStructure;
 319
 320               //There is no representation for coil and it can be either ' ' or null.
 321               if (ss == ' ') {
 322                 ss = COIL;
 323               }
 324             }
 325             else {
 326               ss = COIL;
 327             }
 328             secondaryStructure.add(String.valueOf(ss));
 329           }
 330
 331           return secondaryStructure;
 332   }
 333
 334
 335   @Override
 336   public String getName()
 337   {
 338     return NAME;
 339   }
 340
 341   @Override
 342   public String getDescription()
 343   {
 344     return description;
 345   }
 346
 347   @Override
 348   public boolean isDNA()
 349   {
 350     return false;
 351   }
 352
 353   @Override
 354   public boolean isProtein()
 355   {
 356     return false;
 357   }
 358
 359   @Override
 360   public boolean isSecondaryStructure()
 361   {
 362     return true;
 363   }
 364
 365   @Override
 366   public String toString()
 367   {
 368     return "Score between sequences based on hamming distance between binary vectors marking secondary structure displayed at each column";
 369   }
 370 }