2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.AlignmentViewPanel;
24 import jalview.api.FeatureRenderer;
25 import jalview.api.analysis.ScoreModelI;
26 import jalview.api.analysis.SimilarityParamsI;
27 import jalview.datamodel.AlignmentAnnotation;
28 import jalview.datamodel.AlignmentView;
29 import jalview.datamodel.Annotation;
30 import jalview.datamodel.SeqCigar;
31 import jalview.math.Matrix;
32 import jalview.math.MatrixI;
33 import jalview.util.SetUtils;
35 import java.util.HashSet;
38 /* This class contains methods to calculate distance score between
39 * secondary structure annotations of the sequences. The inverse of
40 * the score is later calculated for similarity score.
42 public class SecondaryStructureDistanceModel extends DistanceScoreModel
44 private static final String NAME = "Secondary Structure Similarity";
46 private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
48 private String description;
50 //maximum distance score is defined as 2 as the possible number of unique ss is 2.
51 private static final int MAX_SCORE = 2;
53 //minimum distance score is defined as 2 as the possible number of unique ss is 2.
54 private static final int MIN_SCORE = 0;
56 private static final char COIL = 'C';
63 public SecondaryStructureDistanceModel()
69 public ScoreModelI getInstance(AlignmentViewPanel view)
71 SecondaryStructureDistanceModel instance;
74 instance = this.getClass().getDeclaredConstructor().newInstance();
75 instance.configureFromAlignmentView(view);
77 } catch (InstantiationException | IllegalAccessException e)
79 jalview.bin.Console.errPrintln("Error in " + getClass().getName()
80 + ".getInstance(): " + e.getMessage());
82 } catch (ReflectiveOperationException roe)
88 boolean configureFromAlignmentView(AlignmentViewPanel view)
91 fr = view.cloneFeatureRenderer();
96 * Calculates a distance measure [i][j] between each pair of sequences as the
97 * average number of features they have but do not share. That is, find the
98 * features each sequence pair has at each column, ignore feature types they
99 * have in common, and count the rest. The totals are normalised by the number
100 * of columns processed.
102 * The parameters argument provides settings for treatment of gap-residue
103 * aligned positions, and whether the score is over the longer or shorter of
104 * each pair of sequences
111 * Calculates distance score [i][j] between each pair of protein sequences
112 * based on their secondary structure annotations (H, E, C). That is, find the
113 * secondary structures each sequence has at each column and scores positively for
114 * each non similar secondary structure annotations. Scores 0 for similar secondary
115 * structure annotations. The final score is normalized by the number of
116 * alignment columns processed, providing an average similarity score.
118 * The parameters argument can include settings for handling gap-residue aligned
119 * positions and may determine if the score calculation is based on the longer or shorter
120 * sequence in each pair. This can be important for handling partial alignments or
121 * sequences of significantly different lengths.
123 * @param seqData The aligned sequence data including secondary structure annotations.
124 * @param params Additional parameters for customizing the scoring process, such as gap
125 * handling and sequence length consideration.
128 public MatrixI findDistances(AlignmentView seqData,
129 SimilarityParamsI params)
131 SeqCigar[] seqs = seqData.getSequences();
132 int noseqs = seqs.length; //no of sequences
133 int cpwidth = 0; // = seqData.getWidth();
134 double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
136 // need to get real position for view position
137 int[] viscont = seqData.getVisibleContigs();
139 Set<SeqCigar> seqsWithUndefinedSS = findSeqsWithUndefinedSS(seqs);
143 * scan each column, compute and add to each distance[i, j]
144 * the number of secondary structure annotation that seqi
145 * and seqj do not share
147 for (int vc = 0; vc < viscont.length; vc += 2)
149 //Iterates for each column position
150 for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
152 cpwidth++; //used to normalise the distance score
155 * get set of sequences without gap in the current column
157 Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
160 * count score for each dissimilar secondary structure annotation on i'th and j'th
161 * sequence. Igonre if similar and add this 'distance' measure to the total
162 * for [i, j] for j > i
164 for (int i = 0; i < (noseqs - 1); i++)
166 //Iterates for each sequences
167 for (int j = i + 1; j < noseqs; j++)
169 SeqCigar sc1 = seqs[i];
170 SeqCigar sc2 = seqs[j];
172 //check if ss is defined
173 boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
174 boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
176 // Set distance to 0 if both SS are not defined
177 if (undefinedSS1 && undefinedSS2) {
178 distances[i][j] += MIN_SCORE;
182 // Set distance to maximum score if either one SS is not defined
183 else if(undefinedSS1 || undefinedSS2) {
184 distances[i][j] += MAX_SCORE;
188 //check if the sequence contains gap in the current column
189 boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
190 boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
192 //Variable to store secondary structure at the current column
193 Set<String> secondaryStructure1 = new HashSet<String>();
194 Set<String> secondaryStructure2 = new HashSet<String>();
196 //secondary structure is fetched only if the current column is not
197 //gap for the sequence
198 if(!gap1 && !undefinedSS1) {
199 secondaryStructure1.addAll(
200 findSSAnnotationForGivenSeqAndCol(seqs[i], cpos));
203 if(!gap2 && !undefinedSS2) {
204 secondaryStructure2.addAll(
205 findSSAnnotationForGivenSeqAndCol(seqs[j], cpos));
209 * gap-gap always scores zero
210 * residue-residue is always scored
211 * include gap-residue score if params say to do so
213 if ((!gap1 && !gap2) || params.includeGaps())
215 int seqDistance = SetUtils.countDisjunction(
216 secondaryStructure1, secondaryStructure2);
217 distances[i][j] += seqDistance;
225 * normalise the distance scores (summed over columns) by the
226 * number of visible columns used in the calculation
227 * and fill in the bottom half of the matrix
229 // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
230 for (int i = 0; i < noseqs; i++)
232 for (int j = i + 1; j < noseqs; j++)
234 distances[i][j] /= cpwidth;
235 distances[j][i] = distances[i][j];
238 return new Matrix(distances);
242 * Builds and returns a set containing sequences (SeqCigar) which do not
243 * have a gap at the given column position.
246 * @param columnPosition
250 protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
251 SeqCigar[] seqs, int columnPosition)
253 Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
254 for (SeqCigar seq : seqs)
256 int spos = seq.findPosition(columnPosition);
260 * position is not a gap
262 seqsWithoutGapAtCol.add(seq);
265 return seqsWithoutGapAtCol;
269 * Builds and returns a set containing sequences (SeqCigar) which have
270 * no secondary structures defined
276 protected Set<SeqCigar> findSeqsWithUndefinedSS(
279 Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
280 for (SeqCigar seq : seqs)
283 AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
286 * secondary structure is undefined for the seq
289 seqsWithUndefinedSS.add(seq);
292 return seqsWithUndefinedSS;
296 * Finds secondary structure annotation for a given sequence (SeqCigar)
297 * and column position corresponding to the sequence.
300 * @param columnPosition
304 private Set<String> findSSAnnotationForGivenSeqAndCol(
305 SeqCigar seq, int columnPosition)
307 Set<String> secondaryStructure = new HashSet<String>();
311 //fetch the position in sequence for the column and finds the
312 //corresponding secondary structure annotation
313 int seqPosition = seq.findPosition(columnPosition);
314 AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
316 if (aa[0].getAnnotationForPosition(seqPosition) != null) {
317 Annotation a = aa[0].getAnnotationForPosition(seqPosition);
318 ss = a.secondaryStructure;
320 //There is no representation for coil and it can be either ' ' or null.
328 secondaryStructure.add(String.valueOf(ss));
331 return secondaryStructure;
336 public String getName()
342 public String getDescription()
348 public boolean isDNA()
354 public boolean isProtein()
360 public boolean isSecondaryStructure()
366 public String toString()
368 return "Score between sequences based on hamming distance between binary vectors marking secondary structure displayed at each column";