2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.AlignmentViewPanel;
24 import jalview.api.FeatureRenderer;
25 import jalview.api.analysis.ScoreModelI;
26 import jalview.api.analysis.SimilarityParamsI;
27 import jalview.datamodel.AlignmentAnnotation;
28 import jalview.datamodel.AlignmentView;
29 import jalview.datamodel.Annotation;
30 import jalview.datamodel.SeqCigar;
31 import jalview.math.Matrix;
32 import jalview.math.MatrixI;
33 import jalview.util.SetUtils;
35 import java.util.HashMap;
36 import java.util.HashSet;
40 /* This class contains methods to calculate distance score between
41 * secondary structure annotations of the sequences. The inverse of
42 * the score is later calculated for similarity score.
44 public class SecondaryStructureDistanceModel extends DistanceScoreModel
46 private static final String NAME = "Secondary Structure Similarity";
48 //label in secondary structure annotation data model from 3d structures
49 private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
51 //label in secondary structure annotation data model from JPred
52 private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred";
54 //label in secondary structure annotation data model from JPred
55 private static final String JPRED_WEBSERVICE = "JPred";
57 private String description;
59 //maximum distance score is defined as 2 as the possible number of unique secondary structure is 2.
60 private static final int MAX_SCORE = 2;
62 //minimum distance score is defined as 2 as the possible number of unique secondary structure is 2.
63 private static final int MIN_SCORE = 0;
65 //character used to represent coil in secondary structure
66 private static final char COIL = 'C';
71 * Annotation label for available sources of secondary structure
73 private static final String[] SS_ANNOTATION_LABELS = {
75 SS_ANNOTATION_FROM_JPRED_LABEL
81 public SecondaryStructureDistanceModel()
87 public ScoreModelI getInstance(AlignmentViewPanel view)
89 SecondaryStructureDistanceModel instance;
92 instance = this.getClass().getDeclaredConstructor().newInstance();
93 instance.configureFromAlignmentView(view);
95 } catch (InstantiationException | IllegalAccessException e)
97 jalview.bin.Console.errPrintln("Error in " + getClass().getName()
98 + ".getInstance(): " + e.getMessage());
100 } catch (ReflectiveOperationException roe)
106 boolean configureFromAlignmentView(AlignmentViewPanel view)
109 fr = view.cloneFeatureRenderer();
114 * Calculates a distance measure [i][j] between each pair of sequences as the
115 * average number of features they have but do not share. That is, find the
116 * features each sequence pair has at each column, ignore feature types they
117 * have in common, and count the rest. The totals are normalised by the number
118 * of columns processed.
120 * The parameters argument provides settings for treatment of gap-residue
121 * aligned positions, and whether the score is over the longer or shorter of
122 * each pair of sequences
129 * Calculates distance score [i][j] between each pair of protein sequences
130 * based on their secondary structure annotations (H, E, C). That is, find the
131 * secondary structures each sequence has at each column and scores positively for
132 * each non similar secondary structure annotations. Scores 0 for similar secondary
133 * structure annotations. The final score is normalized by the number of
134 * alignment columns processed, providing an average similarity score.
136 * The parameters argument can include settings for handling gap-residue aligned
137 * positions and may determine if the score calculation is based on the longer or shorter
138 * sequence in each pair. This can be important for handling partial alignments or
139 * sequences of significantly different lengths.
141 * @param seqData The aligned sequence data including secondary structure annotations.
142 * @param params Additional parameters for customizing the scoring process, such as gap
143 * handling and sequence length consideration.
146 public MatrixI findDistances(AlignmentView seqData,
147 SimilarityParamsI params)
150 SeqCigar[] seqs = seqData.getSequences();
151 int noseqs = seqs.length; //no of sequences
152 int cpwidth = 0; // = seqData.getWidth();
153 double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
154 double[][] substitutionMatrix = getSubstitutionMatrix();
155 //secondary structure source parameter selected by the user from the drop down.
156 String ssSource = params.getSecondaryStructureSource();
158 //defining the default value for secondary structure source as 3d structures
159 //or JPred if user selected JPred
160 String selectedSSSource = SS_ANNOTATION_LABEL;
161 if(ssSource.equals(JPRED_WEBSERVICE))
162 selectedSSSource = SS_ANNOTATION_FROM_JPRED_LABEL;
164 // need to get real position for view position
165 int[] viscont = seqData.getVisibleContigs();
168 * Add secondary structure annotations that are added to the annotation track
171 Map<String, HashSet<String>> ssAlignmentAnnotationForSequences
172 = new HashMap<String,HashSet<String>>();
174 AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment()
175 .getAlignmentAnnotation();
177 if(alignAnnotList.length > 0) {
178 for (AlignmentAnnotation aa: alignAnnotList) {
179 if (selectedSSSource.equals(aa.label)) {
180 ssAlignmentAnnotationForSequences.computeIfAbsent(
181 aa.sequenceRef.getName(), k -> new HashSet<>())
182 .add(aa.description);
188 * Get the set of sequences which are not considered for the calculation.
189 * Following sequences are added:
190 * 1. Sequences without a defined secondary structure from the selected
192 * 2. Sequences whose secondary structure annotations are not added to
193 * the annotation track
195 Set<SeqCigar> seqsWithUndefinedSS
196 = findSeqsWithUndefinedSS(seqs, ssAlignmentAnnotationForSequences);
199 * scan each column, compute and add to each distance[i, j]
200 * the number of secondary structure annotation that seqi
201 * and seqj do not share
203 for (int vc = 0; vc < viscont.length; vc += 2)
205 //Iterates for each column position
206 for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
208 cpwidth++; //used to normalise the distance score
211 * get set of sequences without gap in the current column
213 Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
216 * count score for each dissimilar secondary structure annotation on i'th and j'th
217 * sequence. Igonre if similar and add this 'distance' measure to the total
218 * for [i, j] for j > i
220 for (int i = 0; i < (noseqs - 1); i++)
222 //Iterates for each sequences
223 for (int j = i + 1; j < noseqs; j++)
225 SeqCigar sc1 = seqs[i];
226 SeqCigar sc2 = seqs[j];
229 //check if ss is defined
230 boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
231 boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
233 // Set distance to 0 if both SS are not defined
234 if (undefinedSS1 && undefinedSS2) {
235 distances[i][j] += MIN_SCORE;
239 // Set distance to maximum score if either one SS is not defined
240 else if(undefinedSS1 || undefinedSS2) {
241 distances[i][j] += MAX_SCORE;
245 //check if the sequence contains gap in the current column
246 boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
247 boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
249 //Variable to store secondary structure at the current column
250 char ss1 = 'G', ss2 = 'G';
252 //secondary structure is fetched only if the current column is not
253 //gap for the sequence
254 if(!gap1 && !undefinedSS1) {
256 findSSAnnotationForGivenSeqAndCol(seqs[i], cpos);
259 if(!gap2 && !undefinedSS2) {
261 findSSAnnotationForGivenSeqAndCol(seqs[j], cpos);
265 * gap-gap scores zero
266 * similar ss-ss scores zero
267 * different ss-ss scores 1
268 * gap-ss scores 1 if params say to do so
270 if ((!gap1 && !gap2) || params.includeGaps())
272 // Calculate distance score based on the substitution matrix
273 double seqDistance = substitutionMatrix[getSubstitutionMatrixIndex(ss1)][getSubstitutionMatrixIndex(ss2)];
274 distances[i][j] += seqDistance;
282 * normalise the distance scores (summed over columns) by the
283 * number of visible columns used in the calculation
284 * and fill in the bottom half of the matrix
286 // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
287 for (int i = 0; i < noseqs; i++)
289 for (int j = i + 1; j < noseqs; j++)
291 distances[i][j] /= cpwidth;
292 distances[j][i] = distances[i][j];
295 return new Matrix(distances);
299 * Builds and returns a set containing sequences (SeqCigar) which do not
300 * have a gap at the given column position.
303 * @param columnPosition
307 protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
308 SeqCigar[] seqs, int columnPosition)
310 Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
311 for (SeqCigar seq : seqs)
313 int spos = seq.findPosition(columnPosition);
317 * position is not a gap
319 seqsWithoutGapAtCol.add(seq);
322 return seqsWithoutGapAtCol;
327 * Builds and returns a set containing sequences (SeqCigar) which
328 * are not considered for the similarity calculation.
329 * Following sequences are added:
330 * 1. Sequences without a defined secondary structure from the selected
332 * 2. Sequences whose secondary structure annotations are not added to
333 * the annotation track
335 * @param ssAlignmentAnnotationForSequences
338 protected Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs,
339 Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
340 Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
341 for (SeqCigar seq : seqs) {
342 if (isSSUndefinedOrNotAdded(seq, ssAlignmentAnnotationForSequences)) {
343 seqsWithUndefinedSS.add(seq);
346 return seqsWithUndefinedSS;
351 * Returns true if a sequence (SeqCigar) should not be
352 * considered for the similarity calculation.
353 * Following conditions are checked:
354 * 1. Sequence without a defined secondary structure from the selected
356 * 2. Sequences whose secondary structure annotations are not added to
357 * the annotation track
359 * @param ssAlignmentAnnotationForSequences
362 private boolean isSSUndefinedOrNotAdded(SeqCigar seq,
363 Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
364 for (String label : SS_ANNOTATION_LABELS) {
365 AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label);
366 if (annotations != null) {
367 for (AlignmentAnnotation annotation : annotations) {
368 HashSet<String> descriptionSet = ssAlignmentAnnotationForSequences
369 .get(annotation.sequenceRef.getName());
370 if (descriptionSet != null)
371 if (descriptionSet.contains(annotation.description)) {
372 // Secondary structure annotation is present and
373 //added to the track, no need to add seq
379 // Either annotations are undefined or not added to the track
385 * Finds secondary structure annotation for a given sequence (SeqCigar)
386 * and column position corresponding to the sequence.
389 * @param columnPosition
393 private char findSSAnnotationForGivenSeqAndCol(
394 SeqCigar seq, int columnPosition)
398 //fetch the position in sequence for the column and finds the
399 //corresponding secondary structure annotation
400 //TO DO - consider based on priority
401 int seqPosition = seq.findPosition(columnPosition);
402 AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
405 aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL);
409 if (aa[0].getAnnotationForPosition(seqPosition) != null) {
410 Annotation a = aa[0].getAnnotationForPosition(seqPosition);
411 ss = a.secondaryStructure;
413 //There is no representation for coil and it can be either ' ' or null.
414 if (ss == ' ' || ss == '-') {
428 * Retrieve the substitution matrix.
430 * @return The substitution matrix.
432 private double[][] getSubstitutionMatrix() {
433 // Defining the substitution matrix
434 // This matrix map distance scores between secondary structure symbols
436 return new double[][]{
438 {0.0, 1.0, 1.0, 1.0}, // C - COIL
439 {1.0, 0.0, 1.0, 1.0}, // E - SHEET
440 {1.0, 1.0, 0.0, 1.0}, // H - HELIX
441 {1.0, 1.0, 1.0, 0.0} // G - GAP
446 private int getSubstitutionMatrixIndex(char ss) {
457 throw new IllegalArgumentException("Invalid secondary structure character: " + ss);
462 public String getName()
468 public String getDescription()
474 public boolean isDNA()
480 public boolean isProtein()
486 public boolean isSecondaryStructure()
492 public String toString()
494 return "Score between sequences based on hamming distance between binary "
495 + "vectors marking secondary structure displayed at each column";