*/
package jalview.analysis.scoremodels;
+import jalview.analysis.AlignmentUtils;
import jalview.api.AlignmentViewPanel;
import jalview.api.FeatureRenderer;
import jalview.api.analysis.ScoreModelI;
import jalview.datamodel.SeqCigar;
import jalview.math.Matrix;
import jalview.math.MatrixI;
+import jalview.util.Constants;
import jalview.util.SetUtils;
import java.util.HashMap;
import java.util.Set;
/* This class contains methods to calculate distance score between
- * secondary structure annotations of the sequences. The inverse of
- * the score is later calculated for similarity score.
+ * secondary structure annotations of the sequences.
*/
public class SecondaryStructureDistanceModel extends DistanceScoreModel
{
private static final String NAME = "Secondary Structure Similarity";
-
- //label in secondary structure annotation data model from 3d structures
- private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
-
- //label in secondary structure annotation data model from JPred
- private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred";
- //label in secondary structure annotation data model from JPred
- private static final String JPRED_WEBSERVICE = "JPred";
-
- private String description;
-
- //maximum distance score is defined as 2 as the possible number of unique secondary structure is 2.
- private static final int MAX_SCORE = 2;
+ private ScoreMatrix ssRateMatrix;
- //minimum distance score is defined as 2 as the possible number of unique secondary structure is 2.
- private static final int MIN_SCORE = 0;
-
- //character used to represent coil in secondary structure
- private static final char COIL = 'C';
+ private String description;
FeatureRenderer fr;
- /*
- * Annotation label for available sources of secondary structure
- */
- private static final String[] SS_ANNOTATION_LABELS = {
- SS_ANNOTATION_LABEL,
- SS_ANNOTATION_FROM_JPRED_LABEL
- };
-
+
/**
* Constructor
*/
public SecondaryStructureDistanceModel()
{
-
+
}
-
+
@Override
public ScoreModelI getInstance(AlignmentViewPanel view)
{
fr = view.cloneFeatureRenderer();
return true;
}
-
- /**
- * Calculates a distance measure [i][j] between each pair of sequences as the
- * average number of features they have but do not share. That is, find the
- * features each sequence pair has at each column, ignore feature types they
- * have in common, and count the rest. The totals are normalised by the number
- * of columns processed.
- * <p>
- * The parameters argument provides settings for treatment of gap-residue
- * aligned positions, and whether the score is over the longer or shorter of
- * each pair of sequences
- *
- * @param seqData
- * @param params
- */
/**
* Calculates distance score [i][j] between each pair of protein sequences
- * based on their secondary structure annotations (H, E, C). That is, find the
- * secondary structures each sequence has at each column and scores positively for
- * each non similar secondary structure annotations. Scores 0 for similar secondary
- * structure annotations. The final score is normalized by the number of
+ * based on their secondary structure annotations (H, E, C).
+ * The final score is normalised by the number of
* alignment columns processed, providing an average similarity score.
* <p>
* The parameters argument can include settings for handling gap-residue aligned
* sequences of significantly different lengths.
*
* @param seqData The aligned sequence data including secondary structure annotations.
- * @param params Additional parameters for customizing the scoring process, such as gap
+ * @param params Additional parameters for customising the scoring process, such as gap
* handling and sequence length consideration.
*/
@Override
SeqCigar[] seqs = seqData.getSequences();
int noseqs = seqs.length; //no of sequences
int cpwidth = 0; // = seqData.getWidth();
- double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
- double[][] substitutionMatrix = getSubstitutionMatrix();
+ double[][] similarities = new double[noseqs][noseqs]; //matrix to store similarity score
//secondary structure source parameter selected by the user from the drop down.
String ssSource = params.getSecondaryStructureSource();
+ ssRateMatrix = ScoreModels.getInstance().getSecondaryStructureMatrix();
//defining the default value for secondary structure source as 3d structures
//or JPred if user selected JPred
- String selectedSSSource = SS_ANNOTATION_LABEL;
- if(ssSource.equals(JPRED_WEBSERVICE))
- selectedSSSource = SS_ANNOTATION_FROM_JPRED_LABEL;
+ String selectedSSSource = Constants.SS_ANNOTATION_LABEL;
+ if(ssSource.equals(Constants.SECONDARY_STRUCTURE_LABELS.get(Constants.SS_ANNOTATION_FROM_JPRED_LABEL)))
+ {
+ selectedSSSource = Constants.SS_ANNOTATION_FROM_JPRED_LABEL;
+ }
// need to get real position for view position
int[] viscont = seqData.getVisibleContigs();
= findSeqsWithUndefinedSS(seqs, ssAlignmentAnnotationForSequences);
/*
- * scan each column, compute and add to each distance[i, j]
+ * scan each column, compute and add to each similarity[i, j]
* the number of secondary structure annotation that seqi
* and seqj do not share
*/
//Iterates for each column position
for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
{
- cpwidth++; //used to normalise the distance score
+ cpwidth++; //used to normalise the similarity score
/*
* get set of sequences without gap in the current column
Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
/*
- * count score for each dissimilar secondary structure annotation on i'th and j'th
- * sequence. Igonre if similar and add this 'distance' measure to the total
+ * calculate similarity score for each secondary structure annotation on i'th and j'th
+ * sequence and add this measure to the similarities matrix
* for [i, j] for j > i
*/
for (int i = 0; i < (noseqs - 1); i++)
boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
- // Set distance to 0 if both SS are not defined
+ // Set similarity to max score if both SS are not defined
if (undefinedSS1 && undefinedSS2) {
- distances[i][j] += MIN_SCORE;
+ similarities[i][j] += ssRateMatrix.getMaximumScore();
continue;
}
- // Set distance to maximum score if either one SS is not defined
+ // Set similarity to minimum score if either one SS is not defined
else if(undefinedSS1 || undefinedSS2) {
- distances[i][j] += MAX_SCORE;
+ similarities[i][j] += ssRateMatrix.getMinimumScore();
continue;
}
boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
//Variable to store secondary structure at the current column
- char ss1 = 'G', ss2 = 'G';
+ char ss1 = '*';
+ char ss2 = '*';
//secondary structure is fetched only if the current column is not
//gap for the sequence
- if(!gap1 && !undefinedSS1) {
+ if(!gap1 && !undefinedSS1) {
+ //fetch the position in sequence for the column and finds the
+ //corresponding secondary structure annotation
+ //TO DO - consider based on priority
+ int seqPosition = seqs[i].findPosition(cpos);
+ AlignmentAnnotation[] aa = seqs[i].getRefSeq().getAnnotation(selectedSSSource);
ss1 =
- findSSAnnotationForGivenSeqAndCol(seqs[i], cpos);
+ AlignmentUtils.findSSAnnotationForGivenSeqposition(aa, seqPosition);
}
if(!gap2 && !undefinedSS2) {
- ss2 =
- findSSAnnotationForGivenSeqAndCol(seqs[j], cpos);
+ int seqPosition = seqs[j].findPosition(cpos);
+ AlignmentAnnotation[] aa = seqs[j].getRefSeq().getAnnotation(selectedSSSource);
+ ss2 =
+ AlignmentUtils.findSSAnnotationForGivenSeqposition(aa, seqPosition);
}
- /*
- * gap-gap scores zero
- * similar ss-ss scores zero
- * different ss-ss scores 1
- * gap-ss scores 1 if params say to do so
- */
if ((!gap1 && !gap2) || params.includeGaps())
{
- // Calculate distance score based on the substitution matrix
- double seqDistance = substitutionMatrix[getSubstitutionMatrixIndex(ss1)][getSubstitutionMatrixIndex(ss2)];
- distances[i][j] += seqDistance;
+ // Calculate similarity score based on the substitution matrix
+ double similarityScore = ssRateMatrix.getPairwiseScore(ss1, ss2);
+ similarities[i][j] += similarityScore;
}
}
}
}
/*
- * normalise the distance scores (summed over columns) by the
+ * normalise the similarity scores (summed over columns) by the
* number of visible columns used in the calculation
* and fill in the bottom half of the matrix
*/
// TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
+
for (int i = 0; i < noseqs; i++)
{
for (int j = i + 1; j < noseqs; j++)
- {
- distances[i][j] /= cpwidth;
- distances[j][i] = distances[i][j];
+ {
+ similarities[i][j] /= cpwidth;
+ similarities[j][i] = similarities[i][j];
}
}
- return new Matrix(distances);
+ return ssRateMatrix.similarityToDistance(new Matrix(similarities));
+
}
/**
* (0..)
* @return
*/
- protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
+ private Set<SeqCigar> findSeqsWithoutGapAtColumn(
SeqCigar[] seqs, int columnPosition)
{
Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
* @param ssAlignmentAnnotationForSequences
* @return
*/
- protected Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs,
+ private Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs,
Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
for (SeqCigar seq : seqs) {
*/
private boolean isSSUndefinedOrNotAdded(SeqCigar seq,
Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
- for (String label : SS_ANNOTATION_LABELS) {
+ for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet()) {
AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label);
if (annotations != null) {
for (AlignmentAnnotation annotation : annotations) {
HashSet<String> descriptionSet = ssAlignmentAnnotationForSequences
.get(annotation.sequenceRef.getName());
if (descriptionSet != null)
+ {
if (descriptionSet.contains(annotation.description)) {
// Secondary structure annotation is present and
//added to the track, no need to add seq
return false;
}
+ }
}
}
}
// Either annotations are undefined or not added to the track
return true;
}
-
-
- /**
- * Finds secondary structure annotation for a given sequence (SeqCigar)
- * and column position corresponding to the sequence.
- *
- * @param seq
- * @param columnPosition
- * (0..)
- * @return
- */
- private char findSSAnnotationForGivenSeqAndCol(
- SeqCigar seq, int columnPosition)
- {
- char ss = 'G';
-
- //fetch the position in sequence for the column and finds the
- //corresponding secondary structure annotation
- //TO DO - consider based on priority
- int seqPosition = seq.findPosition(columnPosition);
- AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
-
- if(aa == null) {
- aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL);
- }
-
- if (aa != null) {
- if (aa[0].getAnnotationForPosition(seqPosition) != null) {
- Annotation a = aa[0].getAnnotationForPosition(seqPosition);
- ss = a.secondaryStructure;
-
- //There is no representation for coil and it can be either ' ' or null.
- if (ss == ' ' || ss == '-') {
- ss = COIL;
- }
- }
- else {
- ss = COIL;
- }
-
- }
-
- return ss;
- }
-
- /**
- * Retrieve the substitution matrix.
- *
- * @return The substitution matrix.
- */
- private double[][] getSubstitutionMatrix() {
- // Defining the substitution matrix
- // This matrix map distance scores between secondary structure symbols
- return new double[][]{
- // C E H G
- {0.0, 1.0, 1.0, 1.0}, // C - COIL
- {1.0, 0.0, 1.0, 1.0}, // E - SHEET
- {1.0, 1.0, 0.0, 1.0}, // H - HELIX
- {1.0, 1.0, 1.0, 0.0} // G - GAP
-
- };
- }
-
- private int getSubstitutionMatrixIndex(char ss) {
- switch (ss) {
- case 'C':
- return 0;
- case 'E':
- return 1;
- case 'H':
- return 2;
- case 'G':
- return 3;
- default:
- throw new IllegalArgumentException("Invalid secondary structure character: " + ss);
- }
- }
-
@Override
public String getName()
{
@Override
public String toString()
{
- return "Score between sequences based on hamming distance between binary "
+ return "Score between sequences based on similarity between binary "
+ "vectors marking secondary structure displayed at each column";
}
}
\ No newline at end of file