/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.analysis.scoremodels; import jalview.api.AlignmentViewPanel; import jalview.api.FeatureRenderer; import jalview.api.analysis.ScoreModelI; import jalview.api.analysis.SimilarityParamsI; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentView; import jalview.datamodel.Annotation; import jalview.datamodel.SeqCigar; import jalview.math.Matrix; import jalview.math.MatrixI; import jalview.util.SetUtils; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; /* This class contains methods to calculate distance score between * secondary structure annotations of the sequences. The inverse of * the score is later calculated for similarity score. */ public class SecondaryStructureDistanceModel extends DistanceScoreModel { private static final String NAME = "Secondary Structure Similarity"; private static final String SS_ANNOTATION_LABEL = "Secondary Structure"; private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred"; private String description; //maximum distance score is defined as 2 as the possible number of unique ss is 2. private static final int MAX_SCORE = 2; //minimum distance score is defined as 2 as the possible number of unique ss is 2. private static final int MIN_SCORE = 0; private static final char COIL = 'C'; FeatureRenderer fr; /** * Constructor */ public SecondaryStructureDistanceModel() { } @Override public ScoreModelI getInstance(AlignmentViewPanel view) { SecondaryStructureDistanceModel instance; try { instance = this.getClass().getDeclaredConstructor().newInstance(); instance.configureFromAlignmentView(view); return instance; } catch (InstantiationException | IllegalAccessException e) { jalview.bin.Console.errPrintln("Error in " + getClass().getName() + ".getInstance(): " + e.getMessage()); return null; } catch (ReflectiveOperationException roe) { return null; } } boolean configureFromAlignmentView(AlignmentViewPanel view) { fr = view.cloneFeatureRenderer(); return true; } /** * Calculates a distance measure [i][j] between each pair of sequences as the * average number of features they have but do not share. That is, find the * features each sequence pair has at each column, ignore feature types they * have in common, and count the rest. The totals are normalised by the number * of columns processed. *

* The parameters argument provides settings for treatment of gap-residue * aligned positions, and whether the score is over the longer or shorter of * each pair of sequences * * @param seqData * @param params */ /** * Calculates distance score [i][j] between each pair of protein sequences * based on their secondary structure annotations (H, E, C). That is, find the * secondary structures each sequence has at each column and scores positively for * each non similar secondary structure annotations. Scores 0 for similar secondary * structure annotations. The final score is normalized by the number of * alignment columns processed, providing an average similarity score. *

* The parameters argument can include settings for handling gap-residue aligned * positions and may determine if the score calculation is based on the longer or shorter * sequence in each pair. This can be important for handling partial alignments or * sequences of significantly different lengths. * * @param seqData The aligned sequence data including secondary structure annotations. * @param params Additional parameters for customizing the scoring process, such as gap * handling and sequence length consideration. */ @Override public MatrixI findDistances(AlignmentView seqData, SimilarityParamsI params) { SeqCigar[] seqs = seqData.getSequences(); int noseqs = seqs.length; //no of sequences int cpwidth = 0; // = seqData.getWidth(); double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score // need to get real position for view position int[] viscont = seqData.getVisibleContigs(); Map> calcIdMapInAlignmentAnnotation = new HashMap>(); AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment().getAlignmentAnnotation(); if(alignAnnotList.length > 0) { for (AlignmentAnnotation aa: alignAnnotList) { if (SS_ANNOTATION_LABEL.equals(aa.label) || SS_ANNOTATION_FROM_JPRED_LABEL.equals(aa.label)) { calcIdMapInAlignmentAnnotation.computeIfAbsent(aa.getCalcId(), k -> new HashSet<>()).add(aa.description); } } } Set seqsWithUndefinedSS = findSeqsWithUndefinedSS(seqs, calcIdMapInAlignmentAnnotation); /* * scan each column, compute and add to each distance[i, j] * the number of secondary structure annotation that seqi * and seqj do not share */ for (int vc = 0; vc < viscont.length; vc += 2) { //Iterates for each column position for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++) { cpwidth++; //used to normalise the distance score /* * get set of sequences without gap in the current column */ Set seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos); /* * count score for each dissimilar secondary structure annotation on i'th and j'th * sequence. Igonre if similar and add this 'distance' measure to the total * for [i, j] for j > i */ for (int i = 0; i < (noseqs - 1); i++) { //Iterates for each sequences for (int j = i + 1; j < noseqs; j++) { SeqCigar sc1 = seqs[i]; SeqCigar sc2 = seqs[j]; //check if ss is defined boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1); boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2); // Set distance to 0 if both SS are not defined if (undefinedSS1 && undefinedSS2) { distances[i][j] += MIN_SCORE; continue; } // Set distance to maximum score if either one SS is not defined else if(undefinedSS1 || undefinedSS2) { distances[i][j] += MAX_SCORE; continue; } //check if the sequence contains gap in the current column boolean gap1 = !seqsWithoutGapAtCol.contains(sc1); boolean gap2 = !seqsWithoutGapAtCol.contains(sc2); //Variable to store secondary structure at the current column Set secondaryStructure1 = new HashSet(); Set secondaryStructure2 = new HashSet(); //secondary structure is fetched only if the current column is not //gap for the sequence if(!gap1 && !undefinedSS1) { secondaryStructure1.addAll( findSSAnnotationForGivenSeqAndCol(seqs[i], cpos)); } if(!gap2 && !undefinedSS2) { secondaryStructure2.addAll( findSSAnnotationForGivenSeqAndCol(seqs[j], cpos)); } /* * gap-gap always scores zero * ss-ss is always scored * include gap-ss scores 1 if params say to do so */ if ((!gap1 && !gap2) || params.includeGaps()) { int seqDistance = SetUtils.countDisjunction( secondaryStructure1, secondaryStructure2); distances[i][j] += seqDistance; } } } } } /* * normalise the distance scores (summed over columns) by the * number of visible columns used in the calculation * and fill in the bottom half of the matrix */ // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape for (int i = 0; i < noseqs; i++) { for (int j = i + 1; j < noseqs; j++) { distances[i][j] /= cpwidth; distances[j][i] = distances[i][j]; } } return new Matrix(distances); } /** * Builds and returns a set containing sequences (SeqCigar) which do not * have a gap at the given column position. * * @param seqs * @param columnPosition * (0..) * @return */ protected Set findSeqsWithoutGapAtColumn( SeqCigar[] seqs, int columnPosition) { Set seqsWithoutGapAtCol = new HashSet<>(); for (SeqCigar seq : seqs) { int spos = seq.findPosition(columnPosition); if (spos != -1) { /* * position is not a gap */ seqsWithoutGapAtCol.add(seq); } } return seqsWithoutGapAtCol; } /** * Builds and returns a set containing sequences (SeqCigar) which have * no secondary structures defined * * @param seqs * (0..) * @return */ private static final String[] SS_ANNOTATION_LABELS = { SS_ANNOTATION_LABEL, SS_ANNOTATION_FROM_JPRED_LABEL }; protected Set findSeqsWithUndefinedSS(SeqCigar[] seqs, Map> calcIdMapInAlignmentAnnotation) { Set seqsWithUndefinedSS = new HashSet<>(); for (SeqCigar seq : seqs) { if (isSSUndefinedOrNotAdded(seq, calcIdMapInAlignmentAnnotation)) { seqsWithUndefinedSS.add(seq); } } return seqsWithUndefinedSS; } private boolean isSSUndefinedOrNotAdded(SeqCigar seq, Map> calcIdMapInAlignmentAnnotation) { for (String label : SS_ANNOTATION_LABELS) { AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label); if (annotations != null) { for (AlignmentAnnotation annotation : annotations) { HashSet descriptionList = calcIdMapInAlignmentAnnotation.get(annotation.getCalcId()); if (descriptionList.contains(annotation.description)) { // Secondary structure annotation is present and added to the track, no need to add seq return false; } } } } // Either annotations are undefined or not added to the track return true; } /** * Finds secondary structure annotation for a given sequence (SeqCigar) * and column position corresponding to the sequence. * * @param seq * @param columnPosition * (0..) * @return */ private Set findSSAnnotationForGivenSeqAndCol( SeqCigar seq, int columnPosition) { Set secondaryStructure = new HashSet(); char ss; //fetch the position in sequence for the column and finds the //corresponding secondary structure annotation //TO DO - consider based on priority int seqPosition = seq.findPosition(columnPosition); AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL); if(aa == null) { aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL); } if (aa != null) { if (aa[0].getAnnotationForPosition(seqPosition) != null) { Annotation a = aa[0].getAnnotationForPosition(seqPosition); ss = a.secondaryStructure; //There is no representation for coil and it can be either ' ' or null. if (ss == ' ' || ss == '-') { ss = COIL; } } else { ss = COIL; } secondaryStructure.add(String.valueOf(ss)); } return secondaryStructure; } @Override public String getName() { return NAME; } @Override public String getDescription() { return description; } @Override public boolean isDNA() { return false; } @Override public boolean isProtein() { return false; } @Override public boolean isSecondaryStructure() { return true; } @Override public String toString() { return "Score between sequences based on hamming distance between binary vectors marking secondary structure displayed at each column"; } }