/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see .
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.analysis.scoremodels;
import jalview.api.AlignmentViewPanel;
import jalview.api.FeatureRenderer;
import jalview.api.analysis.ScoreModelI;
import jalview.api.analysis.SimilarityParamsI;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentView;
import jalview.datamodel.Annotation;
import jalview.datamodel.SeqCigar;
import jalview.math.Matrix;
import jalview.math.MatrixI;
import jalview.util.SetUtils;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/* This class contains methods to calculate distance score between
* secondary structure annotations of the sequences. The inverse of
* the score is later calculated for similarity score.
*/
public class SecondaryStructureDistanceModel extends DistanceScoreModel
{
private static final String NAME = "Secondary Structure Similarity";
private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred";
private String description;
//maximum distance score is defined as 2 as the possible number of unique ss is 2.
private static final int MAX_SCORE = 2;
//minimum distance score is defined as 2 as the possible number of unique ss is 2.
private static final int MIN_SCORE = 0;
private static final char COIL = 'C';
FeatureRenderer fr;
/**
* Constructor
*/
public SecondaryStructureDistanceModel()
{
}
@Override
public ScoreModelI getInstance(AlignmentViewPanel view)
{
SecondaryStructureDistanceModel instance;
try
{
instance = this.getClass().getDeclaredConstructor().newInstance();
instance.configureFromAlignmentView(view);
return instance;
} catch (InstantiationException | IllegalAccessException e)
{
jalview.bin.Console.errPrintln("Error in " + getClass().getName()
+ ".getInstance(): " + e.getMessage());
return null;
} catch (ReflectiveOperationException roe)
{
return null;
}
}
boolean configureFromAlignmentView(AlignmentViewPanel view)
{
fr = view.cloneFeatureRenderer();
return true;
}
/**
* Calculates a distance measure [i][j] between each pair of sequences as the
* average number of features they have but do not share. That is, find the
* features each sequence pair has at each column, ignore feature types they
* have in common, and count the rest. The totals are normalised by the number
* of columns processed.
*
* The parameters argument provides settings for treatment of gap-residue
* aligned positions, and whether the score is over the longer or shorter of
* each pair of sequences
*
* @param seqData
* @param params
*/
/**
* Calculates distance score [i][j] between each pair of protein sequences
* based on their secondary structure annotations (H, E, C). That is, find the
* secondary structures each sequence has at each column and scores positively for
* each non similar secondary structure annotations. Scores 0 for similar secondary
* structure annotations. The final score is normalized by the number of
* alignment columns processed, providing an average similarity score.
*
* The parameters argument can include settings for handling gap-residue aligned
* positions and may determine if the score calculation is based on the longer or shorter
* sequence in each pair. This can be important for handling partial alignments or
* sequences of significantly different lengths.
*
* @param seqData The aligned sequence data including secondary structure annotations.
* @param params Additional parameters for customizing the scoring process, such as gap
* handling and sequence length consideration.
*/
@Override
public MatrixI findDistances(AlignmentView seqData,
SimilarityParamsI params)
{
SeqCigar[] seqs = seqData.getSequences();
int noseqs = seqs.length; //no of sequences
int cpwidth = 0; // = seqData.getWidth();
double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
// need to get real position for view position
int[] viscont = seqData.getVisibleContigs();
Map> calcIdMapInAlignmentAnnotation = new HashMap>();
AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment().getAlignmentAnnotation();
if(alignAnnotList.length > 0) {
for (AlignmentAnnotation aa: alignAnnotList) {
if (SS_ANNOTATION_LABEL.equals(aa.label) || SS_ANNOTATION_FROM_JPRED_LABEL.equals(aa.label)) {
calcIdMapInAlignmentAnnotation.computeIfAbsent(aa.getCalcId(), k -> new HashSet<>()).add(aa.description);
}
}
}
Set seqsWithUndefinedSS = findSeqsWithUndefinedSS(seqs, calcIdMapInAlignmentAnnotation);
/*
* scan each column, compute and add to each distance[i, j]
* the number of secondary structure annotation that seqi
* and seqj do not share
*/
for (int vc = 0; vc < viscont.length; vc += 2)
{
//Iterates for each column position
for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
{
cpwidth++; //used to normalise the distance score
/*
* get set of sequences without gap in the current column
*/
Set seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
/*
* count score for each dissimilar secondary structure annotation on i'th and j'th
* sequence. Igonre if similar and add this 'distance' measure to the total
* for [i, j] for j > i
*/
for (int i = 0; i < (noseqs - 1); i++)
{
//Iterates for each sequences
for (int j = i + 1; j < noseqs; j++)
{
SeqCigar sc1 = seqs[i];
SeqCigar sc2 = seqs[j];
//check if ss is defined
boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
// Set distance to 0 if both SS are not defined
if (undefinedSS1 && undefinedSS2) {
distances[i][j] += MIN_SCORE;
continue;
}
// Set distance to maximum score if either one SS is not defined
else if(undefinedSS1 || undefinedSS2) {
distances[i][j] += MAX_SCORE;
continue;
}
//check if the sequence contains gap in the current column
boolean gap1 = !seqsWithoutGapAtCol.contains(sc1);
boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);
//Variable to store secondary structure at the current column
Set secondaryStructure1 = new HashSet();
Set secondaryStructure2 = new HashSet();
//secondary structure is fetched only if the current column is not
//gap for the sequence
if(!gap1 && !undefinedSS1) {
secondaryStructure1.addAll(
findSSAnnotationForGivenSeqAndCol(seqs[i], cpos));
}
if(!gap2 && !undefinedSS2) {
secondaryStructure2.addAll(
findSSAnnotationForGivenSeqAndCol(seqs[j], cpos));
}
/*
* gap-gap always scores zero
* ss-ss is always scored
* include gap-ss scores 1 if params say to do so
*/
if ((!gap1 && !gap2) || params.includeGaps())
{
int seqDistance = SetUtils.countDisjunction(
secondaryStructure1, secondaryStructure2);
distances[i][j] += seqDistance;
}
}
}
}
}
/*
* normalise the distance scores (summed over columns) by the
* number of visible columns used in the calculation
* and fill in the bottom half of the matrix
*/
// TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
for (int i = 0; i < noseqs; i++)
{
for (int j = i + 1; j < noseqs; j++)
{
distances[i][j] /= cpwidth;
distances[j][i] = distances[i][j];
}
}
return new Matrix(distances);
}
/**
* Builds and returns a set containing sequences (SeqCigar) which do not
* have a gap at the given column position.
*
* @param seqs
* @param columnPosition
* (0..)
* @return
*/
protected Set findSeqsWithoutGapAtColumn(
SeqCigar[] seqs, int columnPosition)
{
Set seqsWithoutGapAtCol = new HashSet<>();
for (SeqCigar seq : seqs)
{
int spos = seq.findPosition(columnPosition);
if (spos != -1)
{
/*
* position is not a gap
*/
seqsWithoutGapAtCol.add(seq);
}
}
return seqsWithoutGapAtCol;
}
/**
* Builds and returns a set containing sequences (SeqCigar) which have
* no secondary structures defined
*
* @param seqs
* (0..)
* @return
*/
private static final String[] SS_ANNOTATION_LABELS = {
SS_ANNOTATION_LABEL,
SS_ANNOTATION_FROM_JPRED_LABEL
};
protected Set findSeqsWithUndefinedSS(SeqCigar[] seqs, Map> calcIdMapInAlignmentAnnotation) {
Set seqsWithUndefinedSS = new HashSet<>();
for (SeqCigar seq : seqs) {
if (isSSUndefinedOrNotAdded(seq, calcIdMapInAlignmentAnnotation)) {
seqsWithUndefinedSS.add(seq);
}
}
return seqsWithUndefinedSS;
}
private boolean isSSUndefinedOrNotAdded(SeqCigar seq, Map> calcIdMapInAlignmentAnnotation) {
for (String label : SS_ANNOTATION_LABELS) {
AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label);
if (annotations != null) {
for (AlignmentAnnotation annotation : annotations) {
HashSet descriptionList = calcIdMapInAlignmentAnnotation.get(annotation.getCalcId());
if (descriptionList.contains(annotation.description)) {
// Secondary structure annotation is present and added to the track, no need to add seq
return false;
}
}
}
}
// Either annotations are undefined or not added to the track
return true;
}
/**
* Finds secondary structure annotation for a given sequence (SeqCigar)
* and column position corresponding to the sequence.
*
* @param seq
* @param columnPosition
* (0..)
* @return
*/
private Set findSSAnnotationForGivenSeqAndCol(
SeqCigar seq, int columnPosition)
{
Set secondaryStructure = new HashSet();
char ss;
//fetch the position in sequence for the column and finds the
//corresponding secondary structure annotation
//TO DO - consider based on priority
int seqPosition = seq.findPosition(columnPosition);
AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
if(aa == null) {
aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL);
}
if (aa != null) {
if (aa[0].getAnnotationForPosition(seqPosition) != null) {
Annotation a = aa[0].getAnnotationForPosition(seqPosition);
ss = a.secondaryStructure;
//There is no representation for coil and it can be either ' ' or null.
if (ss == ' ' || ss == '-') {
ss = COIL;
}
}
else {
ss = COIL;
}
secondaryStructure.add(String.valueOf(ss));
}
return secondaryStructure;
}
@Override
public String getName()
{
return NAME;
}
@Override
public String getDescription()
{
return description;
}
@Override
public boolean isDNA()
{
return false;
}
@Override
public boolean isProtein()
{
return false;
}
@Override
public boolean isSecondaryStructure()
{
return true;
}
@Override
public String toString()
{
return "Score between sequences based on hamming distance between binary vectors marking secondary structure displayed at each column";
}
}