src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis.scoremodels;
  22
  23 import jalview.api.AlignmentViewPanel;
  24 import jalview.api.FeatureRenderer;
  25 import jalview.api.analysis.ScoreModelI;
  26 import jalview.api.analysis.SimilarityParamsI;
  27 import jalview.datamodel.AlignmentAnnotation;
  28 import jalview.datamodel.AlignmentView;
  29 import jalview.datamodel.Annotation;
  30 import jalview.datamodel.SeqCigar;
  31 import jalview.math.Matrix;
  32 import jalview.math.MatrixI;
  33 import jalview.util.SetUtils;
  34
  35 import java.util.HashMap;
  36 import java.util.HashSet;
  37 import java.util.Map;
  38 import java.util.Set;
  39
  40 /* This class contains methods to calculate distance score between
  41  * secondary structure annotations of the sequences. The inverse of
  42  * the score is later calculated for similarity score.
  43  */
  44 public class SecondaryStructureDistanceModel extends DistanceScoreModel
  45 {
  46   private static final String NAME = "Secondary Structure Similarity";
  47
  48   //label in secondary structure annotation data model from 3d structures
  49   private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
  50
  51   //label in secondary structure annotation data model from JPred
  52   private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred";
  53
  54   //label in secondary structure annotation data model from JPred
  55   private static final String JPRED_WEBSERVICE = "JPred";
  56
  57   private String description;
  58
  59   //maximum distance score is defined as 2 as the possible number of unique secondary structure is 2.
  60   private static final int MAX_SCORE = 2;
  61
  62   //minimum distance score is defined as 2 as the possible number of unique secondary structure is 2.
  63   private static final int MIN_SCORE = 0;
  64
  65   //character used to represent coil in secondary structure
  66   private static final char COIL = 'C';
  67
  68   FeatureRenderer fr;
  69
  70   /*
  71    * Annotation label for available sources of secondary structure
  72    */
  73   private static final String[] SS_ANNOTATION_LABELS = {
  74       SS_ANNOTATION_LABEL,
  75       SS_ANNOTATION_FROM_JPRED_LABEL
  76   };
  77
  78   /**
  79    * Constructor
  80    */
  81   public SecondaryStructureDistanceModel()
  82   {
  83
  84   }
  85
  86   @Override
  87   public ScoreModelI getInstance(AlignmentViewPanel view)
  88   {
  89     SecondaryStructureDistanceModel instance;
  90     try
  91     {
  92       instance = this.getClass().getDeclaredConstructor().newInstance();
  93       instance.configureFromAlignmentView(view);
  94       return instance;
  95     } catch (InstantiationException | IllegalAccessException e)
  96     {
  97       jalview.bin.Console.errPrintln("Error in " + getClass().getName()
  98               + ".getInstance(): " + e.getMessage());
  99       return null;
 100     } catch (ReflectiveOperationException roe)
 101     {
 102       return null;
 103     }
 104   }
 105
 106   boolean configureFromAlignmentView(AlignmentViewPanel view)
 107
 108   {
 109     fr = view.cloneFeatureRenderer();
 110     return true;
 111   }
 112
 113   /**
 114    * Calculates a distance measure [i][j] between each pair of sequences as the
 115    * average number of features they have but do not share. That is, find the
 116    * features each sequence pair has at each column, ignore feature types they
 117    * have in common, and count the rest. The totals are normalised by the number
 118    * of columns processed.
 119    * <p>
 120    * The parameters argument provides settings for treatment of gap-residue
 121    * aligned positions, and whether the score is over the longer or shorter of
 122    * each pair of sequences
 123    *
 124    * @param seqData
 125    * @param params
 126    */
 127
 128   /**
 129    * Calculates distance score [i][j] between each pair of protein sequences
 130    * based on their secondary structure annotations (H, E, C). That is, find the
 131    * secondary structures each sequence has at each column and scores positively for
 132    * each non similar secondary structure annotations. Scores 0 for similar secondary
 133    * structure annotations. The final score is normalized by the number of
 134    * alignment columns processed, providing an average similarity score.
 135    * <p>
 136    * The parameters argument can include settings for handling gap-residue aligned
 137    * positions and may determine if the score calculation is based on the longer or shorter
 138    * sequence in each pair. This can be important for handling partial alignments or
 139    * sequences of significantly different lengths.
 140    *
 141    * @param seqData  The aligned sequence data including secondary structure annotations.
 142    * @param params   Additional parameters for customizing the scoring process, such as gap
 143    *                 handling and sequence length consideration.
 144    */
 145   @Override
 146   public MatrixI findDistances(AlignmentView seqData,
 147           SimilarityParamsI params)
 148   {
 149
 150     SeqCigar[] seqs = seqData.getSequences();
 151     int noseqs = seqs.length; //no of sequences
 152     int cpwidth = 0; // = seqData.getWidth();
 153     double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
 154     //secondary structure source parameter selected by the user from the drop down.
 155     String ssSource = params.getSecondaryStructureSource();
 156
 157     //defining the default value for secondary structure source as 3d structures
 158     //or JPred if user selected JPred
 159     String selectedSSSource = SS_ANNOTATION_LABEL;
 160     if(ssSource.equals(JPRED_WEBSERVICE))
 161       selectedSSSource = SS_ANNOTATION_FROM_JPRED_LABEL;
 162
 163     // need to get real position for view position
 164     int[] viscont = seqData.getVisibleContigs();
 165
 166     /*
 167      * Add secondary structure annotations that are added to the annotation track
 168      * to the map
 169      */
 170     Map<String, HashSet<String>> ssAlignmentAnnotationForSequences
 171       = new HashMap<String,HashSet<String>>();
 172
 173     AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment()
 174             .getAlignmentAnnotation();
 175
 176     if(alignAnnotList.length > 0) {
 177       for (AlignmentAnnotation aa: alignAnnotList) {
 178         if (selectedSSSource.equals(aa.label)) {
 179           ssAlignmentAnnotationForSequences.computeIfAbsent(
 180                   aa.sequenceRef.getName(), k -> new HashSet<>())
 181           .add(aa.description);
 182         }
 183       }
 184     }
 185
 186     /*
 187      * Get the set of sequences which are not considered for the calculation.
 188      * Following sequences are added:
 189      * 1. Sequences without a defined secondary structure from the selected
 190      * source.
 191      * 2. Sequences whose secondary structure annotations are not added to
 192      * the annotation track
 193      */
 194     Set<SeqCigar> seqsWithUndefinedSS
 195         = findSeqsWithUndefinedSS(seqs, ssAlignmentAnnotationForSequences);
 196
 197     /*
 198      * scan each column, compute and add to each distance[i, j]
 199      * the number of secondary structure annotation that seqi
 200      * and seqj do not share
 201      */
 202     for (int vc = 0; vc < viscont.length; vc += 2)
 203     {
 204       //Iterates for each column position
 205       for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
 206       {
 207         cpwidth++; //used to normalise the distance score
 208
 209         /*
 210          * get set of sequences without gap in the current column
 211          */
 212         Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
 213
 214         /*
 215          * count score for each dissimilar secondary structure annotation on i'th and j'th
 216          * sequence. Igonre if similar and add this 'distance' measure to the total
 217          * for [i, j] for j > i
 218          */
 219         for (int i = 0; i < (noseqs - 1); i++)
 220         {
 221           //Iterates for each sequences
 222           for (int j = i + 1; j < noseqs; j++)
 223           {
 224             SeqCigar sc1 = seqs[i];
 225             SeqCigar sc2 = seqs[j];
 226
 227
 228             //check if ss is defined
 229             boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
 230             boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
 231
 232             // Set distance to 0 if both SS are not defined
 233             if (undefinedSS1 && undefinedSS2) {
 234                 distances[i][j] += MIN_SCORE;
 235                 continue;
 236             }
 237
 238             // Set distance to maximum score if either one SS is not defined
 239             else if(undefinedSS1 || undefinedSS2) {
 240                 distances[i][j] += MAX_SCORE;
 241                 continue;
 242             }
 243
 244             //check if the sequence contains gap in the current column
 245             boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
 246             boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
 247
 248             //Variable to store secondary structure at the current column
 249             Set<String> secondaryStructure1 = new HashSet<String>();
 250             Set<String> secondaryStructure2 = new HashSet<String>();
 251
 252             //secondary structure is fetched only if the current column is not
 253             //gap for the sequence
 254             if(!gap1 && !undefinedSS1) {
 255               secondaryStructure1.addAll(
 256                   findSSAnnotationForGivenSeqAndCol(seqs[i], cpos));
 257             }
 258
 259             if(!gap2 && !undefinedSS2) {
 260               secondaryStructure2.addAll(
 261                   findSSAnnotationForGivenSeqAndCol(seqs[j], cpos));
 262             }
 263
 264             /*
 265              * gap-gap always scores zero
 266              * ss-ss is always scored
 267              * include gap-ss scores 1 if params say to do so
 268              */
 269             if ((!gap1 && !gap2) || params.includeGaps())
 270             {
 271               int seqDistance = SetUtils.countDisjunction(
 272                   secondaryStructure1, secondaryStructure2);
 273               distances[i][j] += seqDistance;
 274             }
 275           }
 276         }
 277       }
 278     }
 279
 280     /*
 281      * normalise the distance scores (summed over columns) by the
 282      * number of visible columns used in the calculation
 283      * and fill in the bottom half of the matrix
 284      */
 285     // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
 286     for (int i = 0; i < noseqs; i++)
 287     {
 288       for (int j = i + 1; j < noseqs; j++)
 289       {
 290         distances[i][j] /= cpwidth;
 291         distances[j][i] = distances[i][j];
 292       }
 293     }
 294     return new Matrix(distances);
 295   }
 296
 297   /**
 298    * Builds and returns a set containing sequences (SeqCigar) which do not
 299    * have a gap at the given column position.
 300    *
 301    * @param seqs
 302    * @param columnPosition
 303    *          (0..)
 304    * @return
 305    */
 306   protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
 307           SeqCigar[] seqs, int columnPosition)
 308   {
 309     Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
 310     for (SeqCigar seq : seqs)
 311     {
 312       int spos = seq.findPosition(columnPosition);
 313       if (spos != -1)
 314       {
 315         /*
 316          * position is not a gap
 317          */
 318         seqsWithoutGapAtCol.add(seq);
 319       }
 320     }
 321     return seqsWithoutGapAtCol;
 322   }
 323
 324
 325   /**
 326    * Builds and returns a set containing sequences (SeqCigar) which
 327    * are not considered for the similarity calculation.
 328    * Following sequences are added:
 329    * 1. Sequences without a defined secondary structure from the selected
 330    * source.
 331    * 2. Sequences whose secondary structure annotations are not added to
 332    * the annotation track
 333    * @param seqs
 334    * @param ssAlignmentAnnotationForSequences
 335    * @return
 336    */
 337   protected Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs,
 338           Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
 339       Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
 340       for (SeqCigar seq : seqs) {
 341           if (isSSUndefinedOrNotAdded(seq, ssAlignmentAnnotationForSequences)) {
 342               seqsWithUndefinedSS.add(seq);
 343           }
 344       }
 345       return seqsWithUndefinedSS;
 346   }
 347
 348
 349   /**
 350    * Returns true if a sequence (SeqCigar) should not be
 351    * considered for the similarity calculation.
 352    * Following conditions are checked:
 353    * 1. Sequence without a defined secondary structure from the selected
 354    * source.
 355    * 2. Sequences whose secondary structure annotations are not added to
 356    * the annotation track
 357    * @param seq
 358    * @param ssAlignmentAnnotationForSequences
 359    * @return
 360    */
 361   private boolean isSSUndefinedOrNotAdded(SeqCigar seq,
 362           Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
 363       for (String label : SS_ANNOTATION_LABELS) {
 364           AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label);
 365           if (annotations != null) {
 366               for (AlignmentAnnotation annotation : annotations) {
 367                 HashSet<String> descriptionSet = ssAlignmentAnnotationForSequences
 368                         .get(annotation.sequenceRef.getName());
 369                 if (descriptionSet != null)
 370                   if (descriptionSet.contains(annotation.description)) {
 371                       // Secondary structure annotation is present and
 372                       //added to the track, no need to add seq
 373                       return false;
 374                   }
 375               }
 376           }
 377       }
 378       // Either annotations are undefined or not added to the track
 379       return true;
 380   }
 381
 382
 383   /**
 384    * Finds secondary structure annotation for a given sequence (SeqCigar)
 385    * and column position corresponding to the sequence.
 386    *
 387    * @param seq
 388    * @param columnPosition
 389    *          (0..)
 390    * @return
 391    */
 392   private Set<String> findSSAnnotationForGivenSeqAndCol(
 393       SeqCigar seq, int columnPosition)
 394   {
 395     Set<String> secondaryStructure = new HashSet<String>();
 396
 397     char ss;
 398
 399     //fetch the position in sequence for the column and finds the
 400     //corresponding secondary structure annotation
 401     //TO DO - consider based on priority
 402     int seqPosition = seq.findPosition(columnPosition);
 403     AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
 404
 405     if(aa == null) {
 406       aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL);
 407     }
 408
 409     if (aa != null) {
 410       if (aa[0].getAnnotationForPosition(seqPosition) != null) {
 411         Annotation a = aa[0].getAnnotationForPosition(seqPosition);
 412         ss = a.secondaryStructure;
 413
 414         //There is no representation for coil and it can be either ' ' or null.
 415         if (ss == ' ' || ss == '-') {
 416           ss = COIL;
 417         }
 418       }
 419       else {
 420         ss = COIL;
 421       }
 422       secondaryStructure.add(String.valueOf(ss));
 423     }
 424
 425     return secondaryStructure;
 426   }
 427
 428
 429   @Override
 430   public String getName()
 431   {
 432     return NAME;
 433   }
 434
 435   @Override
 436   public String getDescription()
 437   {
 438     return description;
 439   }
 440
 441   @Override
 442   public boolean isDNA()
 443   {
 444     return false;
 445   }
 446
 447   @Override
 448   public boolean isProtein()
 449   {
 450     return false;
 451   }
 452
 453   @Override
 454   public boolean isSecondaryStructure()
 455   {
 456     return true;
 457   }
 458
 459   @Override
 460   public String toString()
 461   {
 462     return "Score between sequences based on hamming distance between binary "
 463             + "vectors marking secondary structure displayed at each column";
 464   }
 465 }