2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.AlignmentViewPanel;
24 import jalview.api.FeatureRenderer;
25 import jalview.api.analysis.ScoreModelI;
26 import jalview.api.analysis.SimilarityParamsI;
27 import jalview.datamodel.AlignmentAnnotation;
28 import jalview.datamodel.AlignmentView;
29 import jalview.datamodel.Annotation;
30 import jalview.datamodel.SeqCigar;
31 import jalview.math.Matrix;
32 import jalview.math.MatrixI;
33 import jalview.util.SetUtils;
35 import java.util.HashMap;
36 import java.util.HashSet;
40 /* This class contains methods to calculate distance score between
41 * secondary structure annotations of the sequences. The inverse of
42 * the score is later calculated for similarity score.
44 public class SecondaryStructureDistanceModel extends DistanceScoreModel
46 private static final String NAME = "Secondary Structure Similarity";
48 private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
50 private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred";
52 private String description;
54 //maximum distance score is defined as 2 as the possible number of unique ss is 2.
55 private static final int MAX_SCORE = 2;
57 //minimum distance score is defined as 2 as the possible number of unique ss is 2.
58 private static final int MIN_SCORE = 0;
60 private static final char COIL = 'C';
67 public SecondaryStructureDistanceModel()
73 public ScoreModelI getInstance(AlignmentViewPanel view)
75 SecondaryStructureDistanceModel instance;
78 instance = this.getClass().getDeclaredConstructor().newInstance();
79 instance.configureFromAlignmentView(view);
81 } catch (InstantiationException | IllegalAccessException e)
83 jalview.bin.Console.errPrintln("Error in " + getClass().getName()
84 + ".getInstance(): " + e.getMessage());
86 } catch (ReflectiveOperationException roe)
92 boolean configureFromAlignmentView(AlignmentViewPanel view)
95 fr = view.cloneFeatureRenderer();
100 * Calculates a distance measure [i][j] between each pair of sequences as the
101 * average number of features they have but do not share. That is, find the
102 * features each sequence pair has at each column, ignore feature types they
103 * have in common, and count the rest. The totals are normalised by the number
104 * of columns processed.
106 * The parameters argument provides settings for treatment of gap-residue
107 * aligned positions, and whether the score is over the longer or shorter of
108 * each pair of sequences
115 * Calculates distance score [i][j] between each pair of protein sequences
116 * based on their secondary structure annotations (H, E, C). That is, find the
117 * secondary structures each sequence has at each column and scores positively for
118 * each non similar secondary structure annotations. Scores 0 for similar secondary
119 * structure annotations. The final score is normalized by the number of
120 * alignment columns processed, providing an average similarity score.
122 * The parameters argument can include settings for handling gap-residue aligned
123 * positions and may determine if the score calculation is based on the longer or shorter
124 * sequence in each pair. This can be important for handling partial alignments or
125 * sequences of significantly different lengths.
127 * @param seqData The aligned sequence data including secondary structure annotations.
128 * @param params Additional parameters for customizing the scoring process, such as gap
129 * handling and sequence length consideration.
132 public MatrixI findDistances(AlignmentView seqData,
133 SimilarityParamsI params)
136 SeqCigar[] seqs = seqData.getSequences();
137 int noseqs = seqs.length; //no of sequences
138 int cpwidth = 0; // = seqData.getWidth();
139 double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
141 // need to get real position for view position
142 int[] viscont = seqData.getVisibleContigs();
143 Map<String, HashSet<String>> calcIdMapInAlignmentAnnotation = new HashMap<String,HashSet<String>>();
145 AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment().getAlignmentAnnotation();
146 if(alignAnnotList.length > 0) {
148 for (AlignmentAnnotation aa: alignAnnotList) {
149 if (SS_ANNOTATION_LABEL.equals(aa.label) || SS_ANNOTATION_FROM_JPRED_LABEL.equals(aa.label)) {
150 calcIdMapInAlignmentAnnotation.computeIfAbsent(aa.getCalcId(), k -> new HashSet<>()).add(aa.description);
157 Set<SeqCigar> seqsWithUndefinedSS = findSeqsWithUndefinedSS(seqs, calcIdMapInAlignmentAnnotation);
161 * scan each column, compute and add to each distance[i, j]
162 * the number of secondary structure annotation that seqi
163 * and seqj do not share
165 for (int vc = 0; vc < viscont.length; vc += 2)
167 //Iterates for each column position
168 for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
170 cpwidth++; //used to normalise the distance score
173 * get set of sequences without gap in the current column
175 Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
178 * count score for each dissimilar secondary structure annotation on i'th and j'th
179 * sequence. Igonre if similar and add this 'distance' measure to the total
180 * for [i, j] for j > i
182 for (int i = 0; i < (noseqs - 1); i++)
184 //Iterates for each sequences
185 for (int j = i + 1; j < noseqs; j++)
187 SeqCigar sc1 = seqs[i];
188 SeqCigar sc2 = seqs[j];
191 //check if ss is defined
192 boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
193 boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
195 // Set distance to 0 if both SS are not defined
196 if (undefinedSS1 && undefinedSS2) {
197 distances[i][j] += MIN_SCORE;
201 // Set distance to maximum score if either one SS is not defined
202 else if(undefinedSS1 || undefinedSS2) {
203 distances[i][j] += MAX_SCORE;
207 //check if the sequence contains gap in the current column
208 boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
209 boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
211 //Variable to store secondary structure at the current column
212 Set<String> secondaryStructure1 = new HashSet<String>();
213 Set<String> secondaryStructure2 = new HashSet<String>();
215 //secondary structure is fetched only if the current column is not
216 //gap for the sequence
217 if(!gap1 && !undefinedSS1) {
218 secondaryStructure1.addAll(
219 findSSAnnotationForGivenSeqAndCol(seqs[i], cpos));
222 if(!gap2 && !undefinedSS2) {
223 secondaryStructure2.addAll(
224 findSSAnnotationForGivenSeqAndCol(seqs[j], cpos));
228 * gap-gap always scores zero
229 * ss-ss is always scored
230 * include gap-ss scores 1 if params say to do so
232 if ((!gap1 && !gap2) || params.includeGaps())
234 int seqDistance = SetUtils.countDisjunction(
235 secondaryStructure1, secondaryStructure2);
236 distances[i][j] += seqDistance;
244 * normalise the distance scores (summed over columns) by the
245 * number of visible columns used in the calculation
246 * and fill in the bottom half of the matrix
248 // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
249 for (int i = 0; i < noseqs; i++)
251 for (int j = i + 1; j < noseqs; j++)
253 distances[i][j] /= cpwidth;
254 distances[j][i] = distances[i][j];
257 return new Matrix(distances);
261 * Builds and returns a set containing sequences (SeqCigar) which do not
262 * have a gap at the given column position.
265 * @param columnPosition
269 protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
270 SeqCigar[] seqs, int columnPosition)
272 Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
273 for (SeqCigar seq : seqs)
275 int spos = seq.findPosition(columnPosition);
279 * position is not a gap
281 seqsWithoutGapAtCol.add(seq);
284 return seqsWithoutGapAtCol;
288 * Builds and returns a set containing sequences (SeqCigar) which have
289 * no secondary structures defined
295 private static final String[] SS_ANNOTATION_LABELS = {
297 SS_ANNOTATION_FROM_JPRED_LABEL
300 protected Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs, Map<String, HashSet<String>> calcIdMapInAlignmentAnnotation) {
301 Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
302 for (SeqCigar seq : seqs) {
303 if (isSSUndefinedOrNotAdded(seq, calcIdMapInAlignmentAnnotation)) {
304 seqsWithUndefinedSS.add(seq);
307 return seqsWithUndefinedSS;
310 private boolean isSSUndefinedOrNotAdded(SeqCigar seq, Map<String, HashSet<String>> calcIdMapInAlignmentAnnotation) {
311 for (String label : SS_ANNOTATION_LABELS) {
312 AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label);
313 if (annotations != null) {
314 for (AlignmentAnnotation annotation : annotations) {
315 HashSet<String> descriptionList = calcIdMapInAlignmentAnnotation.get(annotation.getCalcId());
316 if (descriptionList.contains(annotation.description)) {
317 // Secondary structure annotation is present and added to the track, no need to add seq
323 // Either annotations are undefined or not added to the track
329 * Finds secondary structure annotation for a given sequence (SeqCigar)
330 * and column position corresponding to the sequence.
333 * @param columnPosition
337 private Set<String> findSSAnnotationForGivenSeqAndCol(
338 SeqCigar seq, int columnPosition)
340 Set<String> secondaryStructure = new HashSet<String>();
344 //fetch the position in sequence for the column and finds the
345 //corresponding secondary structure annotation
346 //TO DO - consider based on priority
347 int seqPosition = seq.findPosition(columnPosition);
348 AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
351 aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL);
355 if (aa[0].getAnnotationForPosition(seqPosition) != null) {
356 Annotation a = aa[0].getAnnotationForPosition(seqPosition);
357 ss = a.secondaryStructure;
359 //There is no representation for coil and it can be either ' ' or null.
360 if (ss == ' ' || ss == '-') {
367 secondaryStructure.add(String.valueOf(ss));
370 return secondaryStructure;
375 public String getName()
381 public String getDescription()
387 public boolean isDNA()
393 public boolean isProtein()
399 public boolean isSecondaryStructure()
405 public String toString()
407 return "Score between sequences based on hamming distance between binary vectors marking secondary structure displayed at each column";