2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.AlignmentViewPanel;
24 import jalview.api.FeatureRenderer;
25 import jalview.api.analysis.ScoreModelI;
26 import jalview.api.analysis.SimilarityParamsI;
27 import jalview.datamodel.AlignmentAnnotation;
28 import jalview.datamodel.AlignmentView;
29 import jalview.datamodel.Annotation;
30 import jalview.datamodel.SeqCigar;
31 import jalview.math.Matrix;
32 import jalview.math.MatrixI;
33 import jalview.util.SetUtils;
35 import java.util.HashSet;
38 /* This class contains methods to calculate a distance score between
39 * the secondary structure annotations of sequences. The inverse of
40 * the score is later calculated to give a similarity score.
42 public class SecondaryStructureDistanceModel extends DistanceScoreModel
44 private static final String NAME = "Secondary Structure Similarity";
46 private String description;
53 public SecondaryStructureDistanceModel()
59 public ScoreModelI getInstance(AlignmentViewPanel view)
61 SecondaryStructureDistanceModel instance;
64 instance = this.getClass().getDeclaredConstructor().newInstance();
65 instance.configureFromAlignmentView(view);
67 } catch (InstantiationException | IllegalAccessException e)
69 jalview.bin.Console.errPrintln("Error in " + getClass().getName()
70 + ".getInstance(): " + e.getMessage());
72 } catch (ReflectiveOperationException roe)
78 boolean configureFromAlignmentView(AlignmentViewPanel view)
81 fr = view.cloneFeatureRenderer();
86 * Calculates a distance measure [i][j] between each pair of sequences as the
87 * average number of secondary structure annotations they have but do not
88 * share. That is, find the secondary structure each sequence of a pair has at
89 * each column, ignore the ones they have in common, and count the rest. The
90 * totals are normalised by the number of columns processed.
92 * The parameters argument provides settings for treatment of gap-residue
93 * aligned positions, and whether the score is over the longer or shorter of
94 * each pair of sequences
101 * Calculates distance score [i][j] between each pair of protein sequences
102 * based on their secondary structure annotations (H, E, C). That is, find the
103 * secondary structure each sequence has at each column and score positively for
104 * each dissimilar secondary structure annotation. Scores 0 for similar secondary
105 * structure annotations. The final score is normalized by the number of
106 * alignment columns processed, providing an average distance score.
108 * The parameters argument can include settings for handling gap-residue aligned
109 * positions and may determine if the score calculation is based on the longer or shorter
110 * sequence in each pair. This can be important for handling partial alignments or
111 * sequences of significantly different lengths.
113 * @param seqData The aligned sequence data including secondary structure annotations.
114 * @param params Additional parameters for customizing the scoring process, such as gap
115 * handling and sequence length consideration.
118 public MatrixI findDistances(AlignmentView seqData,
119 SimilarityParamsI params)
121 SeqCigar[] seqs = seqData.getSequences();
122 int noseqs = seqs.length; //no of sequences
123 int cpwidth = 0; // = seqData.getWidth();
124 double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
126 // need to get real position for view position
127 int[] viscont = seqData.getVisibleContigs();
130 * scan each column, compute and add to each distance[i, j]
131 * the number of secondary structure annotations that seqi
132 * and seqj do not share
134 for (int vc = 0; vc < viscont.length; vc += 2)
136 //Iterates for each column position
137 for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
139 cpwidth++; //used to normalise the distance score
142 * get set of sequences without gap in the current column
144 Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
147 * count score for each dissimilar secondary structure annotation on i'th and j'th
148 * sequence. Ignore if similar and add this 'distance' measure to the total
149 * for [i, j] for j > i
151 for (int i = 0; i < (noseqs - 1); i++)
153 //Iterates over each remaining sequence j to pair with sequence i
154 for (int j = i + 1; j < noseqs; j++)
156 SeqCigar sc1 = seqs[i];
157 SeqCigar sc2 = seqs[j];
158 boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
159 boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
161 //Variable to store secondary structure at the current column
162 Set<String> secondaryStructure1 = new HashSet<String>();
163 Set<String> secondaryStructure2 = new HashSet<String>();
165 //secondary structure is fetched only if the current column is not
166 //gap for the sequence
168 secondaryStructure1.addAll(
169 findSSAnnotationForGivenSeqAndCol(seqs[i], cpos));
173 secondaryStructure2.addAll(
174 findSSAnnotationForGivenSeqAndCol(seqs[j], cpos));
178 * gap-gap always scores zero
179 * residue-residue is always scored
180 * include gap-residue score if params say to do so
182 if ((!gap1 && !gap2) || params.includeGaps())
184 int seqDistance = SetUtils.countDisjunction(
185 secondaryStructure1, secondaryStructure2);
186 distances[i][j] += seqDistance;
194 * normalise the distance scores (summed over columns) by the
195 * number of visible columns used in the calculation
196 * and fill in the bottom half of the matrix
198 // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
199 for (int i = 0; i < noseqs; i++)
201 for (int j = i + 1; j < noseqs; j++)
203 distances[i][j] /= cpwidth;
204 distances[j][i] = distances[i][j];
207 return new Matrix(distances);
211 * Builds and returns a set containing sequences (SeqCigar) which do not
212 * have a gap at the given column position.
215 * @param columnPosition
219 protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
220 SeqCigar[] seqs, int columnPosition)
222 Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
223 for (SeqCigar seq : seqs)
225 int spos = seq.findPosition(columnPosition);
229 * position is not a gap
231 seqsWithoutGapAtCol.add(seq);
234 return seqsWithoutGapAtCol;
238 * Finds secondary structure annotation for a given sequence (SeqCigar)
239 * and column position corresponding to the sequence.
242 * @param columnPosition
246 private Set<String> findSSAnnotationForGivenSeqAndCol(
247 SeqCigar seq, int columnPosition)
249 Set<String> secondaryStructure = new HashSet<String>();
251 char ss = '\0'; //default null character
253 //fetch the position in sequence for the column and finds the
254 //corresponding secondary structure annotation
255 int seqPosition = seq.findPosition(columnPosition);
256 AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation("Secondary Structure");
258 if (aa[0].getAnnotationForPosition(seqPosition) != null) {
259 Annotation a = aa[0].getAnnotationForPosition(seqPosition);
260 ss = a.secondaryStructure;
262 ss = 'C'; // In JalView, 'C' is represented as ' '
268 if (ss != '\0') { // Check if ss is not the default null character
269 secondaryStructure.add(String.valueOf(ss));
272 return secondaryStructure;
277 public String getName()
283 public String getDescription()
289 public boolean isDNA()
295 public boolean isProtein()
301 public boolean isSecondaryStructure()
307 public String toString()
309 return "Score between sequences based on hamming distance between binary vectors marking features displayed at each column";