2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.AlignmentViewPanel;
24 import jalview.api.FeatureRenderer;
25 import jalview.api.analysis.ScoreModelI;
26 import jalview.api.analysis.SimilarityParamsI;
27 import jalview.datamodel.AlignmentAnnotation;
28 import jalview.datamodel.AlignmentView;
29 import jalview.datamodel.Annotation;
30 import jalview.datamodel.SeqCigar;
31 import jalview.math.Matrix;
32 import jalview.math.MatrixI;
33 import jalview.util.SetUtils;
35 import java.util.HashMap;
36 import java.util.HashSet;
40 /* This class contains methods to calculate distance score between
41 * secondary structure annotations of the sequences. The inverse of
42 * the score is later calculated for similarity score.
44 public class SecondaryStructureDistanceModel extends DistanceScoreModel
46 private static final String NAME = "Secondary Structure Similarity";
48 // label of secondary structure annotation rows derived from 3D structures
49 private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
51 // label of secondary structure annotation rows predicted by JPred
52 private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred";
54 // value of the secondary structure source parameter that selects JPred; findDistances compares it with params.getSecondaryStructureSource()
55 private static final String JPRED_WEBSERVICE = "JPred";
57 private String description;
59 // score added for a column when exactly one sequence of a pair has no usable secondary structure (see findDistances)
60 private static final int MAX_SCORE = 2;
62 // score added for a column when neither sequence of a pair has usable secondary structure (see findDistances)
63 private static final int MIN_SCORE = 0;
65 // character used to represent coil; substituted when an annotation's secondary structure is ' ' or '-'
66 private static final char COIL = 'C';
71 * Annotation label for available sources of secondary structure
73 private static final String[] SS_ANNOTATION_LABELS = {
// NOTE(review): the other lines of this initialiser are not visible in this excerpt - confirm the full list of labels
75 SS_ANNOTATION_FROM_JPRED_LABEL
// No-argument constructor; getInstance creates new instances reflectively
// through getDeclaredConstructor().newInstance().
// NOTE(review): the constructor body is not visible in this excerpt - confirm it is empty.
81 public SecondaryStructureDistanceModel()
87 public ScoreModelI getInstance(AlignmentViewPanel view)
89 SecondaryStructureDistanceModel instance;
92 instance = this.getClass().getDeclaredConstructor().newInstance();
93 instance.configureFromAlignmentView(view);
95 } catch (InstantiationException | IllegalAccessException e)
97 jalview.bin.Console.errPrintln("Error in " + getClass().getName()
98 + ".getInstance(): " + e.getMessage());
100 } catch (ReflectiveOperationException roe)
106 boolean configureFromAlignmentView(AlignmentViewPanel view)
109 fr = view.cloneFeatureRenderer();
114 * Calculates a distance measure [i][j] between each pair of sequences as the
115 * average number of features they have but do not share. That is, find the
116 * features each sequence pair has at each column, ignore feature types they
117 * have in common, and count the rest. The totals are normalised by the number
118 * of columns processed.
120 * The parameters argument provides settings for treatment of gap-residue
121 * aligned positions, and whether the score is over the longer or shorter of
122 * each pair of sequences
129 * Calculates distance score [i][j] between each pair of protein sequences
130 * based on their secondary structure annotations (H, E, C). That is, find the
131 * secondary structures each sequence has at each column and scores positively for
132 * each non similar secondary structure annotations. Scores 0 for similar secondary
133 * structure annotations. The final score is normalized by the number of
134 * alignment columns processed, providing an average similarity score.
136 * The parameters argument can include settings for handling gap-residue aligned
137 * positions and may determine if the score calculation is based on the longer or shorter
138 * sequence in each pair. This can be important for handling partial alignments or
139 * sequences of significantly different lengths.
141 * @param seqData The aligned sequence data including secondary structure annotations.
142 * @param params Additional parameters for customizing the scoring process, such as gap
143 * handling and sequence length consideration.
146 public MatrixI findDistances(AlignmentView seqData,
147 SimilarityParamsI params)
150 SeqCigar[] seqs = seqData.getSequences();
151 int noseqs = seqs.length; //no of sequences
152 int cpwidth = 0; // = seqData.getWidth();
153 double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
154 //secondary structure source parameter selected by the user from the drop down.
155 String ssSource = params.getSecondaryStructureSource();
157 //defining the default value for secondary structure source as 3d structures
158 //or JPred if user selected JPred
159 String selectedSSSource = SS_ANNOTATION_LABEL;
160 if(ssSource.equals(JPRED_WEBSERVICE))
161 selectedSSSource = SS_ANNOTATION_FROM_JPRED_LABEL;
163 // need to get real position for view position
164 int[] viscont = seqData.getVisibleContigs();
167 * Add secondary structure annotations that are added to the annotation track
170 Map<String, HashSet<String>> ssAlignmentAnnotationForSequences
171 = new HashMap<String,HashSet<String>>();
173 AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment()
174 .getAlignmentAnnotation();
176 if(alignAnnotList.length > 0) {
177 for (AlignmentAnnotation aa: alignAnnotList) {
178 if (selectedSSSource.equals(aa.label)) {
179 ssAlignmentAnnotationForSequences.computeIfAbsent(
180 aa.sequenceRef.getName(), k -> new HashSet<>())
181 .add(aa.description);
187 * Get the set of sequences which are not considered for the calculation.
188 * Following sequences are added:
189 * 1. Sequences without a defined secondary structure from the selected
191 * 2. Sequences whose secondary structure annotations are not added to
192 * the annotation track
194 Set<SeqCigar> seqsWithUndefinedSS
195 = findSeqsWithUndefinedSS(seqs, ssAlignmentAnnotationForSequences);
198 * scan each column, compute and add to each distance[i, j]
199 * the number of secondary structure annotation that seqi
200 * and seqj do not share
202 for (int vc = 0; vc < viscont.length; vc += 2)
204 //Iterates for each column position
205 for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
207 cpwidth++; //used to normalise the distance score
210 * get set of sequences without gap in the current column
212 Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
215 * count score for each dissimilar secondary structure annotation on i'th and j'th
216 * sequence. Igonre if similar and add this 'distance' measure to the total
217 * for [i, j] for j > i
219 for (int i = 0; i < (noseqs - 1); i++)
221 //Iterates for each sequences
222 for (int j = i + 1; j < noseqs; j++)
224 SeqCigar sc1 = seqs[i];
225 SeqCigar sc2 = seqs[j];
228 //check if ss is defined
229 boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
230 boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
232 // Set distance to 0 if both SS are not defined
233 if (undefinedSS1 && undefinedSS2) {
234 distances[i][j] += MIN_SCORE;
238 // Set distance to maximum score if either one SS is not defined
239 else if(undefinedSS1 || undefinedSS2) {
240 distances[i][j] += MAX_SCORE;
244 //check if the sequence contains gap in the current column
245 boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
246 boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
248 //Variable to store secondary structure at the current column
249 Set<String> secondaryStructure1 = new HashSet<String>();
250 Set<String> secondaryStructure2 = new HashSet<String>();
252 //secondary structure is fetched only if the current column is not
253 //gap for the sequence
254 if(!gap1 && !undefinedSS1) {
255 secondaryStructure1.addAll(
256 findSSAnnotationForGivenSeqAndCol(seqs[i], cpos));
259 if(!gap2 && !undefinedSS2) {
260 secondaryStructure2.addAll(
261 findSSAnnotationForGivenSeqAndCol(seqs[j], cpos));
265 * gap-gap always scores zero
266 * ss-ss is always scored
267 * include gap-ss scores 1 if params say to do so
269 if ((!gap1 && !gap2) || params.includeGaps())
271 int seqDistance = SetUtils.countDisjunction(
272 secondaryStructure1, secondaryStructure2);
273 distances[i][j] += seqDistance;
281 * normalise the distance scores (summed over columns) by the
282 * number of visible columns used in the calculation
283 * and fill in the bottom half of the matrix
285 // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
286 for (int i = 0; i < noseqs; i++)
288 for (int j = i + 1; j < noseqs; j++)
290 distances[i][j] /= cpwidth;
291 distances[j][i] = distances[i][j];
294 return new Matrix(distances);
298 * Builds and returns a set containing sequences (SeqCigar) which do not
299 * have a gap at the given column position.
302 * @param columnPosition
306 protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
307 SeqCigar[] seqs, int columnPosition)
309 Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
310 for (SeqCigar seq : seqs)
312 int spos = seq.findPosition(columnPosition);
316 * position is not a gap
318 seqsWithoutGapAtCol.add(seq);
321 return seqsWithoutGapAtCol;
326 * Builds and returns a set containing sequences (SeqCigar) which
327 * are not considered for the similarity calculation.
328 * Following sequences are added:
329 * 1. Sequences without a defined secondary structure from the selected
331 * 2. Sequences whose secondary structure annotations are not added to
332 * the annotation track
334 * @param ssAlignmentAnnotationForSequences
337 protected Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs,
338 Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
339 Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
340 for (SeqCigar seq : seqs) {
341 if (isSSUndefinedOrNotAdded(seq, ssAlignmentAnnotationForSequences)) {
342 seqsWithUndefinedSS.add(seq);
345 return seqsWithUndefinedSS;
350 * Returns true if a sequence (SeqCigar) should not be
351 * considered for the similarity calculation.
352 * Following conditions are checked:
353 * 1. Sequence without a defined secondary structure from the selected
355 * 2. Sequences whose secondary structure annotations are not added to
356 * the annotation track
358 * @param ssAlignmentAnnotationForSequences
361 private boolean isSSUndefinedOrNotAdded(SeqCigar seq,
362 Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
363 for (String label : SS_ANNOTATION_LABELS) {
364 AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label);
365 if (annotations != null) {
366 for (AlignmentAnnotation annotation : annotations) {
367 HashSet<String> descriptionSet = ssAlignmentAnnotationForSequences
368 .get(annotation.sequenceRef.getName());
369 if (descriptionSet != null)
370 if (descriptionSet.contains(annotation.description)) {
371 // Secondary structure annotation is present and
372 //added to the track, no need to add seq
378 // Either annotations are undefined or not added to the track
384 * Finds secondary structure annotation for a given sequence (SeqCigar)
385 * and column position corresponding to the sequence.
388 * @param columnPosition
392 private Set<String> findSSAnnotationForGivenSeqAndCol(
393 SeqCigar seq, int columnPosition)
395 Set<String> secondaryStructure = new HashSet<String>();
399 //fetch the position in sequence for the column and finds the
400 //corresponding secondary structure annotation
401 //TO DO - consider based on priority
402 int seqPosition = seq.findPosition(columnPosition);
403 AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
406 aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL);
410 if (aa[0].getAnnotationForPosition(seqPosition) != null) {
411 Annotation a = aa[0].getAnnotationForPosition(seqPosition);
412 ss = a.secondaryStructure;
414 //There is no representation for coil and it can be either ' ' or null.
415 if (ss == ' ' || ss == '-') {
422 secondaryStructure.add(String.valueOf(ss));
425 return secondaryStructure;
// NOTE(review): only the signature is visible in this excerpt; the body
// (presumably returning the NAME constant - confirm) is not shown.
430 public String getName()
// NOTE(review): only the signature is visible in this excerpt; the body
// (presumably returning the description field - confirm) is not shown.
436 public String getDescription()
// NOTE(review): only the signature is visible in this excerpt; the returned
// value is not shown - confirm against the full file.
442 public boolean isDNA()
// NOTE(review): only the signature is visible in this excerpt; the returned
// value is not shown - confirm against the full file.
448 public boolean isProtein()
// NOTE(review): only the signature is visible in this excerpt; the returned
// value (presumably true for this model - confirm) is not shown.
454 public boolean isSecondaryStructure()
460 public String toString()
462 return "Score between sequences based on hamming distance between binary "
463 + "vectors marking secondary structure displayed at each column";