src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis.scoremodels;
  22
  23 import jalview.api.AlignmentViewPanel;
  24 import jalview.api.FeatureRenderer;
  25 import jalview.api.analysis.ScoreModelI;
  26 import jalview.api.analysis.SimilarityParamsI;
  27 import jalview.datamodel.AlignmentAnnotation;
  28 import jalview.datamodel.AlignmentView;
  29 import jalview.datamodel.Annotation;
  30 import jalview.datamodel.SeqCigar;
  31 import jalview.math.Matrix;
  32 import jalview.math.MatrixI;
  33 import jalview.util.SetUtils;
  34
  35 import java.util.HashMap;
  36 import java.util.HashSet;
  37 import java.util.Map;
  38 import java.util.Set;
  39
  40 /* This class contains methods to calculate distance score between
  41  * secondary structure annotations of the sequences. The inverse of
  42  * the score is later calculated for similarity score.
  43  */
  44 public class SecondaryStructureDistanceModel extends DistanceScoreModel
  45 {
  46   private static final String NAME = "Secondary Structure Similarity";
  47
  48   private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
  49
  50   private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred";
  51
  52   private String description;
  53
  54   //maximum distance score is defined as 2 as the possible number of unique ss is 2.
  55   private static final int MAX_SCORE = 2;
  56
  57   //minimum distance score is defined as 2 as the possible number of unique ss is 2.
  58   private static final int MIN_SCORE = 0;
  59
  60   private static final char COIL = 'C';
  61
  62   FeatureRenderer fr;
  63
  64   /**
  65    * Constructor
  66    */
  67   public SecondaryStructureDistanceModel()
  68   {
  69
  70   }
  71
  72   @Override
  73   public ScoreModelI getInstance(AlignmentViewPanel view)
  74   {
  75     SecondaryStructureDistanceModel instance;
  76     try
  77     {
  78       instance = this.getClass().getDeclaredConstructor().newInstance();
  79       instance.configureFromAlignmentView(view);
  80       return instance;
  81     } catch (InstantiationException | IllegalAccessException e)
  82     {
  83       jalview.bin.Console.errPrintln("Error in " + getClass().getName()
  84               + ".getInstance(): " + e.getMessage());
  85       return null;
  86     } catch (ReflectiveOperationException roe)
  87     {
  88       return null;
  89     }
  90   }
  91
  92   boolean configureFromAlignmentView(AlignmentViewPanel view)
  93
  94   {
  95     fr = view.cloneFeatureRenderer();
  96     return true;
  97   }
  98
  99   /**
 100    * Calculates a distance measure [i][j] between each pair of sequences as the
 101    * average number of features they have but do not share. That is, find the
 102    * features each sequence pair has at each column, ignore feature types they
 103    * have in common, and count the rest. The totals are normalised by the number
 104    * of columns processed.
 105    * <p>
 106    * The parameters argument provides settings for treatment of gap-residue
 107    * aligned positions, and whether the score is over the longer or shorter of
 108    * each pair of sequences
 109    *
 110    * @param seqData
 111    * @param params
 112    */
 113
 114   /**
 115    * Calculates distance score [i][j] between each pair of protein sequences
 116    * based on their secondary structure annotations (H, E, C). That is, find the
 117    * secondary structures each sequence has at each column and scores positively for
 118    * each non similar secondary structure annotations. Scores 0 for similar secondary
 119    * structure annotations. The final score is normalized by the number of
 120    * alignment columns processed, providing an average similarity score.
 121    * <p>
 122    * The parameters argument can include settings for handling gap-residue aligned
 123    * positions and may determine if the score calculation is based on the longer or shorter
 124    * sequence in each pair. This can be important for handling partial alignments or
 125    * sequences of significantly different lengths.
 126    *
 127    * @param seqData  The aligned sequence data including secondary structure annotations.
 128    * @param params   Additional parameters for customizing the scoring process, such as gap
 129    *                 handling and sequence length consideration.
 130    */
 131   @Override
 132   public MatrixI findDistances(AlignmentView seqData,
 133           SimilarityParamsI params)
 134   {
 135
 136     SeqCigar[] seqs = seqData.getSequences();
 137     int noseqs = seqs.length; //no of sequences
 138     int cpwidth = 0; // = seqData.getWidth();
 139     double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
 140
 141     // need to get real position for view position
 142     int[] viscont = seqData.getVisibleContigs();
 143     Map<String, HashSet<String>> calcIdMapInAlignmentAnnotation = new HashMap<String,HashSet<String>>();
 144
 145     AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment().getAlignmentAnnotation();
 146     if(alignAnnotList.length > 0) {
 147
 148       for (AlignmentAnnotation aa: alignAnnotList) {
 149         if (SS_ANNOTATION_LABEL.equals(aa.label) || SS_ANNOTATION_FROM_JPRED_LABEL.equals(aa.label)) {
 150             calcIdMapInAlignmentAnnotation.computeIfAbsent(aa.getCalcId(), k -> new HashSet<>()).add(aa.description);
 151         }
 152
 153       }
 154     }
 155
 156
 157     Set<SeqCigar> seqsWithUndefinedSS = findSeqsWithUndefinedSS(seqs, calcIdMapInAlignmentAnnotation);
 158
 159
 160     /*
 161      * scan each column, compute and add to each distance[i, j]
 162      * the number of secondary structure annotation that seqi
 163      * and seqj do not share
 164      */
 165     for (int vc = 0; vc < viscont.length; vc += 2)
 166     {
 167       //Iterates for each column position
 168       for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
 169       {
 170         cpwidth++; //used to normalise the distance score
 171
 172         /*
 173          * get set of sequences without gap in the current column
 174          */
 175         Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
 176
 177         /*
 178          * count score for each dissimilar secondary structure annotation on i'th and j'th
 179          * sequence. Igonre if similar and add this 'distance' measure to the total
 180          * for [i, j] for j > i
 181          */
 182         for (int i = 0; i < (noseqs - 1); i++)
 183         {
 184           //Iterates for each sequences
 185           for (int j = i + 1; j < noseqs; j++)
 186           {
 187             SeqCigar sc1 = seqs[i];
 188             SeqCigar sc2 = seqs[j];
 189
 190
 191             //check if ss is defined
 192             boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
 193             boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
 194
 195             // Set distance to 0 if both SS are not defined
 196             if (undefinedSS1 && undefinedSS2) {
 197                 distances[i][j] += MIN_SCORE;
 198                 continue;
 199             }
 200
 201             // Set distance to maximum score if either one SS is not defined
 202             else if(undefinedSS1 || undefinedSS2) {
 203                 distances[i][j] += MAX_SCORE;
 204                 continue;
 205             }
 206
 207             //check if the sequence contains gap in the current column
 208             boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
 209             boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
 210
 211             //Variable to store secondary structure at the current column
 212             Set<String> secondaryStructure1 = new HashSet<String>();
 213             Set<String> secondaryStructure2 = new HashSet<String>();
 214
 215             //secondary structure is fetched only if the current column is not
 216             //gap for the sequence
 217             if(!gap1 && !undefinedSS1) {
 218                 secondaryStructure1.addAll(
 219                                 findSSAnnotationForGivenSeqAndCol(seqs[i], cpos));
 220             }
 221
 222             if(!gap2 && !undefinedSS2) {
 223                 secondaryStructure2.addAll(
 224                                 findSSAnnotationForGivenSeqAndCol(seqs[j], cpos));
 225             }
 226
 227             /*
 228              * gap-gap always scores zero
 229              * ss-ss is always scored
 230              * include gap-ss scores 1 if params say to do so
 231              */
 232             if ((!gap1 && !gap2) || params.includeGaps())
 233             {
 234               int seqDistance = SetUtils.countDisjunction(
 235                           secondaryStructure1, secondaryStructure2);
 236               distances[i][j] += seqDistance;
 237             }
 238           }
 239         }
 240       }
 241     }
 242
 243     /*
 244      * normalise the distance scores (summed over columns) by the
 245      * number of visible columns used in the calculation
 246      * and fill in the bottom half of the matrix
 247      */
 248     // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
 249     for (int i = 0; i < noseqs; i++)
 250     {
 251       for (int j = i + 1; j < noseqs; j++)
 252       {
 253         distances[i][j] /= cpwidth;
 254         distances[j][i] = distances[i][j];
 255       }
 256     }
 257     return new Matrix(distances);
 258   }
 259
 260   /**
 261    * Builds and returns a set containing sequences (SeqCigar) which do not
 262    * have a gap at the given column position.
 263    *
 264    * @param seqs
 265    * @param columnPosition
 266    *          (0..)
 267    * @return
 268    */
 269   protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
 270           SeqCigar[] seqs, int columnPosition)
 271   {
 272     Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
 273     for (SeqCigar seq : seqs)
 274     {
 275       int spos = seq.findPosition(columnPosition);
 276       if (spos != -1)
 277       {
 278         /*
 279          * position is not a gap
 280          */
 281           seqsWithoutGapAtCol.add(seq);
 282       }
 283     }
 284     return seqsWithoutGapAtCol;
 285   }
 286
 287   /**
 288    * Builds and returns a set containing sequences (SeqCigar) which have
 289    * no secondary structures defined
 290    *
 291    * @param seqs
 292    *          (0..)
 293    * @return
 294    */
 295   private static final String[] SS_ANNOTATION_LABELS = {
 296       SS_ANNOTATION_LABEL,
 297       SS_ANNOTATION_FROM_JPRED_LABEL
 298   };
 299
 300   protected Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs, Map<String, HashSet<String>> calcIdMapInAlignmentAnnotation) {
 301       Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
 302       for (SeqCigar seq : seqs) {
 303           if (isSSUndefinedOrNotAdded(seq, calcIdMapInAlignmentAnnotation)) {
 304               seqsWithUndefinedSS.add(seq);
 305           }
 306       }
 307       return seqsWithUndefinedSS;
 308   }
 309
 310   private boolean isSSUndefinedOrNotAdded(SeqCigar seq, Map<String, HashSet<String>> calcIdMapInAlignmentAnnotation) {
 311       for (String label : SS_ANNOTATION_LABELS) {
 312           AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label);
 313           if (annotations != null) {
 314               for (AlignmentAnnotation annotation : annotations) {
 315                 HashSet<String> descriptionList = calcIdMapInAlignmentAnnotation.get(annotation.getCalcId());
 316                   if (descriptionList.contains(annotation.description)) {
 317                       // Secondary structure annotation is present and added to the track, no need to add seq
 318                       return false;
 319                   }
 320               }
 321           }
 322       }
 323       // Either annotations are undefined or not added to the track
 324       return true;
 325   }
 326
 327
 328   /**
 329    * Finds secondary structure annotation for a given sequence (SeqCigar)
 330    * and column position corresponding to the sequence.
 331    *
 332    * @param seq
 333    * @param columnPosition
 334    *          (0..)
 335    * @return
 336    */
 337   private Set<String> findSSAnnotationForGivenSeqAndCol(
 338                   SeqCigar seq, int columnPosition)
 339   {
 340           Set<String> secondaryStructure = new HashSet<String>();
 341
 342           char ss;
 343
 344           //fetch the position in sequence for the column and finds the
 345           //corresponding secondary structure annotation
 346           //TO DO - consider based on priority
 347           int seqPosition = seq.findPosition(columnPosition);
 348           AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
 349
 350           if(aa == null) {
 351             aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL);
 352           }
 353
 354           if (aa != null) {
 355             if (aa[0].getAnnotationForPosition(seqPosition) != null) {
 356               Annotation a = aa[0].getAnnotationForPosition(seqPosition);
 357               ss = a.secondaryStructure;
 358
 359               //There is no representation for coil and it can be either ' ' or null.
 360               if (ss == ' ' || ss == '-') {
 361                 ss = COIL;
 362               }
 363             }
 364             else {
 365               ss = COIL;
 366             }
 367             secondaryStructure.add(String.valueOf(ss));
 368           }
 369
 370           return secondaryStructure;
 371   }
 372
 373
 374   @Override
 375   public String getName()
 376   {
 377     return NAME;
 378   }
 379
 380   @Override
 381   public String getDescription()
 382   {
 383     return description;
 384   }
 385
 386   @Override
 387   public boolean isDNA()
 388   {
 389     return false;
 390   }
 391
 392   @Override
 393   public boolean isProtein()
 394   {
 395     return false;
 396   }
 397
 398   @Override
 399   public boolean isSecondaryStructure()
 400   {
 401     return true;
 402   }
 403
 404   @Override
 405   public String toString()
 406   {
 407     return "Score between sequences based on hamming distance between binary vectors marking secondary structure displayed at each column";
 408   }
 409 }