src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.analysis.scoremodels;
  22
  23 import jalview.api.AlignmentViewPanel;
  24 import jalview.api.FeatureRenderer;
  25 import jalview.api.analysis.ScoreModelI;
  26 import jalview.api.analysis.SimilarityParamsI;
  27 import jalview.datamodel.AlignmentAnnotation;
  28 import jalview.datamodel.AlignmentView;
  29 import jalview.datamodel.Annotation;
  30 import jalview.datamodel.SeqCigar;
  31 import jalview.math.Matrix;
  32 import jalview.math.MatrixI;
  33 import jalview.util.SetUtils;
  34
  35 import java.util.HashSet;
  36 import java.util.Set;
  37
  38 /* This class contains methods to calculate distance score between
  39  * secondary structure annotations of the sequences. The inverse of
  40  * the score is later calculated for similarity score.
  41  */
  42 public class SecondaryStructureDistanceModel extends DistanceScoreModel
  43 {
  44   private static final String NAME = "Secondary Structure Similarity";
  45
  46   private String description;
  47
  48   FeatureRenderer fr;
  49
  50   /**
  51    * Constructor
  52    */
  53   public SecondaryStructureDistanceModel()
  54   {
  55
  56   }
  57
  58   @Override
  59   public ScoreModelI getInstance(AlignmentViewPanel view)
  60   {
  61     SecondaryStructureDistanceModel instance;
  62     try
  63     {
  64       instance = this.getClass().getDeclaredConstructor().newInstance();
  65       instance.configureFromAlignmentView(view);
  66       return instance;
  67     } catch (InstantiationException | IllegalAccessException e)
  68     {
  69       jalview.bin.Console.errPrintln("Error in " + getClass().getName()
  70               + ".getInstance(): " + e.getMessage());
  71       return null;
  72     } catch (ReflectiveOperationException roe)
  73     {
  74       return null;
  75     }
  76   }
  77
  78   boolean configureFromAlignmentView(AlignmentViewPanel view)
  79
  80   {
  81     fr = view.cloneFeatureRenderer();
  82     return true;
  83   }
  84
  85   /**
  86    * Calculates a distance measure [i][j] between each pair of sequences as the
  87    * average number of features they have but do not share. That is, find the
  88    * features each sequence pair has at each column, ignore feature types they
  89    * have in common, and count the rest. The totals are normalised by the number
  90    * of columns processed.
  91    * <p>
  92    * The parameters argument provides settings for treatment of gap-residue
  93    * aligned positions, and whether the score is over the longer or shorter of
  94    * each pair of sequences
  95    *
  96    * @param seqData
  97    * @param params
  98    */
  99
 100   /**
 101    * Calculates distance score [i][j] between each pair of protein sequences
 102    * based on their secondary structure annotations (H, E, C). That is, find the
 103    * secondary structures each sequence has at each column and scores positively for
 104    * each non similar secondary structure annotations. Scores 0 for similar secondary
 105    * structure annotations. The final score is normalized by the number of
 106    * alignment columns processed, providing an average similarity score.
 107    * <p>
 108    * The parameters argument can include settings for handling gap-residue aligned
 109    * positions and may determine if the score calculation is based on the longer or shorter
 110    * sequence in each pair. This can be important for handling partial alignments or
 111    * sequences of significantly different lengths.
 112    *
 113    * @param seqData  The aligned sequence data including secondary structure annotations.
 114    * @param params   Additional parameters for customizing the scoring process, such as gap
 115    *                 handling and sequence length consideration.
 116    */
 117   @Override
 118   public MatrixI findDistances(AlignmentView seqData,
 119           SimilarityParamsI params)
 120   {
 121     SeqCigar[] seqs = seqData.getSequences();
 122     int noseqs = seqs.length; //no of sequences
 123     int cpwidth = 0; // = seqData.getWidth();
 124     double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
 125
 126     // need to get real position for view position
 127     int[] viscont = seqData.getVisibleContigs();
 128
 129     /*
 130      * scan each column, compute and add to each distance[i, j]
 131      * the number of secondary structure annotation that seqi
 132      * and seqj do not share
 133      */
 134     for (int vc = 0; vc < viscont.length; vc += 2)
 135     {
 136       //Iterates for each column position
 137       for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
 138       {
 139         cpwidth++; //used to normalise the distance score
 140
 141         /*
 142          * get set of sequences without gap in the current column
 143          */
 144         Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
 145
 146         /*
 147          * count score for each dissimilar secondary structure annotation on i'th and j'th
 148          * sequence. Igonre if similar and add this 'distance' measure to the total
 149          * for [i, j] for j > i
 150          */
 151         for (int i = 0; i < (noseqs - 1); i++)
 152         {
 153           //Iterates for each sequences
 154           for (int j = i + 1; j < noseqs; j++)
 155           {
 156             SeqCigar sc1 = seqs[i];
 157             SeqCigar sc2 = seqs[j];
 158             boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
 159             boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
 160
 161             //Variable to store secondary structure at the current column
 162             Set<String> secondaryStructure1 = new HashSet<String>();
 163             Set<String> secondaryStructure2 = new HashSet<String>();
 164
 165             //secondary structure is fetched only if the current column is not
 166             //gap for the sequence
 167             if(!gap1) {
 168                 secondaryStructure1.addAll(
 169                                 findSSAnnotationForGivenSeqAndCol(seqs[i], cpos));
 170             }
 171
 172             if(!gap2) {
 173                 secondaryStructure2.addAll(
 174                                 findSSAnnotationForGivenSeqAndCol(seqs[j], cpos));
 175             }
 176
 177             /*
 178              * gap-gap always scores zero
 179              * residue-residue is always scored
 180              * include gap-residue score if params say to do so
 181              */
 182             if ((!gap1 && !gap2) || params.includeGaps())
 183             {
 184               int seqDistance = SetUtils.countDisjunction(
 185                           secondaryStructure1, secondaryStructure2);
 186               distances[i][j] += seqDistance;
 187             }
 188           }
 189         }
 190       }
 191     }
 192
 193     /*
 194      * normalise the distance scores (summed over columns) by the
 195      * number of visible columns used in the calculation
 196      * and fill in the bottom half of the matrix
 197      */
 198     // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
 199     for (int i = 0; i < noseqs; i++)
 200     {
 201       for (int j = i + 1; j < noseqs; j++)
 202       {
 203         distances[i][j] /= cpwidth;
 204         distances[j][i] = distances[i][j];
 205       }
 206     }
 207     return new Matrix(distances);
 208   }
 209
 210   /**
 211    * Builds and returns a set containing sequences (SeqCigar) which do not
 212    * have a gap at the given column position.
 213    *
 214    * @param seqs
 215    * @param columnPosition
 216    *          (0..)
 217    * @return
 218    */
 219   protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
 220           SeqCigar[] seqs, int columnPosition)
 221   {
 222     Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
 223     for (SeqCigar seq : seqs)
 224     {
 225       int spos = seq.findPosition(columnPosition);
 226       if (spos != -1)
 227       {
 228         /*
 229          * position is not a gap
 230          */
 231           seqsWithoutGapAtCol.add(seq);
 232       }
 233     }
 234     return seqsWithoutGapAtCol;
 235   }
 236
 237   /**
 238    * Finds secondary structure annotation for a given sequence (SeqCigar)
 239    * and column position corresponding to the sequence.
 240    *
 241    * @param seq
 242    * @param columnPosition
 243    *          (0..)
 244    * @return
 245    */
 246   private Set<String> findSSAnnotationForGivenSeqAndCol(
 247                   SeqCigar seq, int columnPosition)
 248   {
 249           Set<String> secondaryStructure = new HashSet<String>();
 250
 251           char ss = '\0'; //default null character
 252
 253           //fetch the position in sequence for the column and finds the
 254           //corresponding secondary structure annotation
 255           int seqPosition = seq.findPosition(columnPosition);
 256           AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation("Secondary Structure");
 257           if (aa != null) {
 258               Annotation a = aa[0].getAnnotationForPosition(seqPosition);
 259               ss = a.secondaryStructure;
 260               if (ss == ' ') {
 261                   ss = 'C'; // In JalView, 'C' is represented as ' '
 262               }
 263               if (ss != '\0') { // Check if ss is not the default null character
 264                   secondaryStructure.add(String.valueOf(ss));
 265               }
 266           }
 267           return secondaryStructure;
 268   }
 269
 270
 271   @Override
 272   public String getName()
 273   {
 274     return NAME;
 275   }
 276
 277   @Override
 278   public String getDescription()
 279   {
 280     return description;
 281   }
 282
 283   @Override
 284   public boolean isDNA()
 285   {
 286     return false;
 287   }
 288
 289   @Override
 290   public boolean isProtein()
 291   {
 292     return false;
 293   }
 294
 295   @Override
 296   public boolean isSecondaryStructure()
 297   {
 298     return true;
 299   }
 300
 301   @Override
 302   public String toString()
 303   {
 304     return "Score between sequences based on hamming distance between binary vectors marking features displayed at each column";
 305   }
 306 }