From 9a6ef22e716b40bca80a69b042705f80b8b612b7 Mon Sep 17 00:00:00 2001
From: Jim Procter
Date: Thu, 27 Jun 2024 17:36:47 +0100
Subject: [PATCH] JAL-4386 allow score models to generate additional labelled entities to cluster - and do so for sequences with multiple secondary structure lines

---
Review notes (text in this section is ignored by git am):
 * This copy of the patch had lost its line breaks, its indentation and
   every <...> span (generic type arguments, author e-mail address). Line
   breaks were rebuilt from the hunk line counts; indentation and the
   position of blank context lines are a best guess, so apply with
   "git apply --ignore-whitespace" and check the result.
 * Generic type arguments are restored from usage. The value type in
   Map<SequenceI, ArrayList<AlignmentAnnotation>> (also used on the removed
   line) is an assumption - confirm it against the base file.
 * Changes relative to the original commit:
   - expandSeqData tested ssSource == "" (reference comparison); it now
     uses isEmpty().
   - findDistances passed an array of nulls to expandSeqData, which calls
     getName() on every element; it now passes each cigar's reference
     sequence (assumes SeqCigar.getRefSeq() returns SequenceI - confirm).
   - makeLeaves tolerates labels being null.
   - whitespace-only hunks, a commented-out line and the stray ';' after
     the default method body are dropped; Javadoc added.
 * Hunk headers and the diffstat are recomputed; the index lines still
   carry the blob ids of the original commit.

 src/jalview/analysis/TreeBuilder.java              |   16 +-
 .../SecondaryStructureDistanceModel.java           |  193 ++++++++++++++------
 src/jalview/api/analysis/ScoreModelI.java          |   26 ++
 3 files changed, 181 insertions(+), 54 deletions(-)

diff --git a/src/jalview/analysis/TreeBuilder.java b/src/jalview/analysis/TreeBuilder.java
index 61f65ff..560a0a4 100644
--- a/src/jalview/analysis/TreeBuilder.java
+++ b/src/jalview/analysis/TreeBuilder.java
@@ -28,9 +28,12 @@ import jalview.datamodel.CigarArray;
 import jalview.datamodel.SeqCigar;
 import jalview.datamodel.SequenceI;
 import jalview.datamodel.SequenceNode;
+import jalview.util.MessageManager;
 import jalview.viewmodel.AlignmentViewport;
 
+import java.util.ArrayList;
 import java.util.BitSet;
+import java.util.List;
 import java.util.Vector;
 
 public abstract class TreeBuilder extends TreeEngine
@@ -40,6 +43,8 @@ public abstract class TreeBuilder extends TreeEngine
   public static final String NEIGHBOUR_JOINING = "NJ";
 
   protected SequenceI[] sequences;
+  /** leaf names supplied by the score model; used when there is one per leaf */
+  protected List<String> labels;
 
   public AlignmentView seqData;
 
@@ -121,6 +126,10 @@ public abstract class TreeBuilder extends TreeEngine
    */
   protected void computeTree(ScoreModelI sm, SimilarityParamsI scoreOptions)
   {
+    labels = new ArrayList<>();
+    sequences = sm.expandSeqData(sequences, seqData, scoreOptions, labels);
+    noseqs = sequences.length;
+
     distances = sm.findDistances(seqData, scoreOptions);
 
     makeLeaves();
@@ -177,7 +186,12 @@ public abstract class TreeBuilder extends TreeEngine
       SequenceNode sn = new SequenceNode();
 
       sn.setElement(sequences[i]);
-      sn.setName(sequences[i].getName());
+      if (labels != null && labels.size() == noseqs)
+      {
+        sn.setName(labels.get(i));
+      } else {
+        sn.setName(sequences[i].getName());
+      }
       node.addElement(sn);
       BitSet bs = new BitSet();
       bs.set(i);
diff --git a/src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java b/src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java
index 4e3a3de..468c3c5 100644
--- a/src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java
+++ b/src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java
@@ -31,10 +31,13 @@ import jalview.datamodel.SeqCigar;
 import jalview.datamodel.SequenceI;
 import jalview.math.Matrix;
 import jalview.math.MatrixI;
+import jalview.util.Constants;
 import jalview.util.MessageManager;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -87,6 +90,82 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
     return true;
   }
 
+  /**
+   * The secondary structure annotation scored for each leaf created by
+   * expandSeqData, in leaf order; null where the leaf's sequence has none
+   */
+  ArrayList<AlignmentAnnotation> ssForSeqs = null;
+
+  /**
+   * Creates one leaf for each secondary structure annotation found for a
+   * sequence, or a single leaf when none is found. The expanded SeqCigars
+   * replace those held by seqData, and one label is appended per leaf.
+   *
+   * @param sequences
+   *          sequences to expand, in the same order as seqData's SeqCigars
+   * @param seqData
+   *          view whose SeqCigars are replaced by the expanded set
+   * @param scoreParams
+   *          provides the secondary structure source used to select
+   *          annotations
+   * @param labels
+   *          receives a label for each leaf: the sequence name, then '|', then
+   *          the annotation's source (or " No Secondary Structure")
+   * @return the sequence for each leaf, in the same order as labels
+   */
+  @Override
+  public SequenceI[] expandSeqData(SequenceI[] sequences,
+          AlignmentView seqData, SimilarityParamsI scoreParams,
+          List<String> labels)
+  {
+    ssForSeqs = new ArrayList<>();
+    List<SequenceI> newSequences = new ArrayList<>();
+    List<SeqCigar> newCigs = new ArrayList<>();
+    int sq = 0;
+
+    AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment()
+            .getAlignmentAnnotation();
+
+    String ssSource = scoreParams.getSecondaryStructureSource();
+    // isEmpty() compares content; == on strings only tests object identity
+    if (ssSource == null || ssSource.isEmpty())
+    {
+      ssSource = MessageManager.getString("option.ss_providers_all");
+    }
+
+    /*
+     * Add secondary structure annotations that are added to the annotation track
+     * to the map
+     */
+    Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences
+            = AlignmentUtils.getSequenceAssociatedAlignmentAnnotations(alignAnnotList, ssSource);
+
+    for (SeqCigar scig : seqData.getSequences())
+    {
+      SequenceI alSeq = sequences[sq++];
+      List<AlignmentAnnotation> ssec = ssAlignmentAnnotationForSequences.get(scig.getRefSeq());
+      if (ssec == null)
+      {
+        // not defined
+        newSequences.add(alSeq);
+        labels.add(alSeq.getName()+"|"+" No Secondary Structure");
+        newCigs.add(scig);
+        ssForSeqs.add(null);
+      } else {
+        for (int i = 0; i < ssec.size(); i++)
+        {
+          labels.add(alSeq.getName()+"|"+AlignmentUtils.extractSSSourceFromAnnotationDescription(ssec.get(i)));
+          newSequences.add(alSeq);
+          newCigs.add(scig);
+          ssForSeqs.add(ssec.get(i));
+        }
+      }
+    }
+
+    seqData.setSequences(newCigs.toArray(new SeqCigar[0]));
+    return newSequences.toArray(new SequenceI[0]);
+  }
+
   /**
    * Calculates distance score [i][j] between each pair of protein sequences
    * based on their secondary structure annotations (H, E, C).
@@ -105,8 +184,22 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
   @Override
   public MatrixI findDistances(AlignmentView seqData,
           SimilarityParamsI params)
   {
-
+    if (ssForSeqs == null
+            || ssForSeqs.size() != seqData.getSequences().length)
+    {
+      // ssForSeqs is missing or does not match this view, so run
+      // expandSeqData now to populate it. It reads each sequence's name to
+      // build labels, so pass the reference sequences, not an array of nulls.
+      SeqCigar[] cigars = seqData.getSequences();
+      SequenceI[] sequences = new SequenceI[cigars.length];
+      for (int s = 0; s < cigars.length; s++)
+      {
+        sequences[s] = cigars[s].getRefSeq();
+      }
+      // we throw away the new labels in this case..
+      expandSeqData(sequences, seqData, params, new ArrayList<String>());
+    }
     SeqCigar[] seqs = seqData.getSequences();
     int noseqs = seqs.length; //no of sequences
     int cpwidth = 0;
@@ -121,17 +214,6 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
 
     // need to get real position for view position
     int[] viscont = seqData.getVisibleContigs();
-
-    AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment()
-            .getAlignmentAnnotation();
-
-
-    /*
-     * Add secondary structure annotations that are added to the annotation track
-     * to the map
-     */
-    Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences
-            = AlignmentUtils.getSequenceAssociatedAlignmentAnnotations(alignAnnotList, ssSource);
 
     /*
      * scan each column, compute and add to each similarity[i, j]
@@ -157,59 +239,64 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
          */
         for (int i = 0; i < (noseqs - 1); i++)
         {
-          //Iterates for each sequences
+          AlignmentAnnotation aa_i = ssForSeqs.get(i);
+          boolean undefinedSS1 = aa_i == null;
+          // check if the sequence contains gap in the current column
+          boolean gap1 = !seqsWithoutGapAtCol.contains(seqs[i]);
+          // secondary structure is fetched only if the current column is not
+          // gap for the sequence
+          char ss1 = '*';
+          if (!gap1 && !undefinedSS1)
+          {
+            // fetch the position in sequence for the column and finds the
+            // corresponding secondary structure annotation
+            // TO DO - consider based on priority and displayed
+            int seqPosition_i = seqs[i].findPosition(cpos);
+            if (aa_i != null)
+              ss1 = AlignmentUtils.findSSAnnotationForGivenSeqposition(aa_i,
+                      seqPosition_i);
+          }
+          // Iterates for each sequences
           for (int j = i + 1; j < noseqs; j++)
           {
-
-            //check if ss is defined
-            boolean undefinedSS1 = ssAlignmentAnnotationForSequences.get(seqs[i].getRefSeq()) == null;
-            boolean undefinedSS2 = ssAlignmentAnnotationForSequences.get(seqs[j].getRefSeq()) == null;
+
+            // check if ss is defined
+            AlignmentAnnotation aa_j = ssForSeqs.get(j);
+            boolean undefinedSS2 = aa_j == null;
 
             // Set similarity to max score if both SS are not defined
-            if (undefinedSS1 && undefinedSS2) {
-                similarities[i][j] += ssRateMatrix.getMaximumScore();
-                continue;
-            }
-
+            if (undefinedSS1 && undefinedSS2)
+            {
+              similarities[i][j] += ssRateMatrix.getMaximumScore();
+              continue;
+            }
+
             // Set similarity to minimum score if either one SS is not defined
-            else if(undefinedSS1 || undefinedSS2) {
-                similarities[i][j] += ssRateMatrix.getMinimumScore();
-                continue;
+            else if (undefinedSS1 || undefinedSS2)
+            {
+              similarities[i][j] += ssRateMatrix.getMinimumScore();
+              continue;
             }
-
-            //check if the sequence contains gap in the current column
-            boolean gap1 = !seqsWithoutGapAtCol.contains(seqs[i]);
-            boolean gap2 = !seqsWithoutGapAtCol.contains(seqs[j]);
-
-            //Variable to store secondary structure at the current column
-            char ss1 = '*';
+
+            boolean gap2 = !seqsWithoutGapAtCol.contains(seqs[j]);
+
+            // Variable to store secondary structure at the current column
             char ss2 = '*';
-
-            //secondary structure is fetched only if the current column is not
-            //gap for the sequence
-            if(!gap1 && !undefinedSS1) {
-              //fetch the position in sequence for the column and finds the
-              //corresponding secondary structure annotation
-              //TO DO - consider based on priority and displayed
-              int seqPosition = seqs[i].findPosition(cpos);
-              AlignmentAnnotation aa = ssAlignmentAnnotationForSequences.get(seqs[i].getRefSeq()).get(0);
-              if(aa!=null)
-                ss1 =
-                    AlignmentUtils.findSSAnnotationForGivenSeqposition(aa, seqPosition);
-            }
-
-            if(!gap2 && !undefinedSS2) {
+
+            if (!gap2 && !undefinedSS2)
+            {
               int seqPosition = seqs[j].findPosition(cpos);
-              AlignmentAnnotation aa = ssAlignmentAnnotationForSequences.get(seqs[j].getRefSeq()).get(0);
-              if(aa!=null)
-                ss2 =
-                    AlignmentUtils.findSSAnnotationForGivenSeqposition(aa, seqPosition);
-            }
+
+              if (aa_j != null)
+                ss2 = AlignmentUtils.findSSAnnotationForGivenSeqposition(
+                        aa_j, seqPosition);
+            }
 
             if ((!gap1 && !gap2) || params.includeGaps())
             {
               // Calculate similarity score based on the substitution matrix
-              double similarityScore = ssRateMatrix.getPairwiseScore(ss1, ss2);
+              double similarityScore = ssRateMatrix.getPairwiseScore(ss1,
+                      ss2);
               similarities[i][j] += similarityScore;
             }
           }
diff --git a/src/jalview/api/analysis/ScoreModelI.java b/src/jalview/api/analysis/ScoreModelI.java
index a243c0c..3613afd 100644
--- a/src/jalview/api/analysis/ScoreModelI.java
+++ b/src/jalview/api/analysis/ScoreModelI.java
@@ -20,8 +20,11 @@
  */
 package jalview.api.analysis;
 
+import java.util.List;
+
 import jalview.api.AlignmentViewPanel;
 import jalview.datamodel.AlignmentView;
+import jalview.datamodel.SequenceI;
 import jalview.math.MatrixI;
 
 public interface ScoreModelI
@@ -112,4 +115,27 @@ public interface ScoreModelI
    * @return
    */
   ScoreModelI getInstance(AlignmentViewPanel avp);
+
+  /**
+   * Score models may create multiple leaves for a single sequence - implement
+   * this method if you do. The default returns sequences unchanged and adds no
+   * labels.
+   *
+   * @param sequences
+   *          - sequences to be filtered/expanded set of leaves
+   * @param seqData
+   *          - origin
+   * @param scoreParams
+   *          - score model parameters (not used by this default)
+   * @param labels
+   *          - strings to show instead of the SequenceI.getName() for each
+   *          element of sequences attached to leaves
+   * @return filtered/expanded set of leaves to be analysed
+   */
+  default SequenceI[] expandSeqData(SequenceI[] sequences,
+          AlignmentView seqData, SimilarityParamsI scoreParams,
+          List<String> labels)
+  {
+    return sequences;
+  }
 }
-- 
1.7.10.2