JAL-4386 allow score models to generate additional labelled entities to cluster ...
authorJim Procter <jprocter@dundee.ac.uk>
Thu, 27 Jun 2024 16:36:47 +0000 (17:36 +0100)
committerJim Procter <jprocter@dundee.ac.uk>
Thu, 27 Jun 2024 16:36:47 +0000 (17:36 +0100)
src/jalview/analysis/TreeBuilder.java
src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java
src/jalview/api/analysis/ScoreModelI.java

index 61f65ff..560a0a4 100644 (file)
@@ -28,9 +28,12 @@ import jalview.datamodel.CigarArray;
 import jalview.datamodel.SeqCigar;
 import jalview.datamodel.SequenceI;
 import jalview.datamodel.SequenceNode;
+import jalview.util.MessageManager;
 import jalview.viewmodel.AlignmentViewport;
 
+import java.util.ArrayList;
 import java.util.BitSet;
+import java.util.List;
 import java.util.Vector;
 
 public abstract class TreeBuilder extends TreeEngine
@@ -40,6 +43,7 @@ public abstract class TreeBuilder extends TreeEngine
   public static final String NEIGHBOUR_JOINING = "NJ";
 
   protected SequenceI[] sequences;
+  protected List<String> labels;
 
   public AlignmentView seqData;
 
@@ -74,7 +78,7 @@ public abstract class TreeBuilder extends TreeEngine
     }
 
     init(seqStrings, start, end);
-
+    
     computeTree(sm, scoreParameters);
   }
 
@@ -83,6 +87,7 @@ public abstract class TreeBuilder extends TreeEngine
     return sequences;
   }
 
+  
   /**
    * 
    * @return true if tree has real distances
@@ -121,6 +126,10 @@ public abstract class TreeBuilder extends TreeEngine
    */
   protected void computeTree(ScoreModelI sm, SimilarityParamsI scoreOptions)
   {
+    labels = new ArrayList<String>();
+    sequences = sm.expandSeqData(sequences, seqData, scoreOptions, labels);
+    noseqs = sequences.length;
+    
     distances = sm.findDistances(seqData, scoreOptions);
 
     makeLeaves();
@@ -177,7 +186,12 @@ public abstract class TreeBuilder extends TreeEngine
       SequenceNode sn = new SequenceNode();
 
       sn.setElement(sequences[i]);
-      sn.setName(sequences[i].getName());
+      if (labels.size()==noseqs)
+      {
+        sn.setName(labels.get(i));
+      } else {
+        sn.setName(sequences[i].getName());
+      }
       node.addElement(sn);
       BitSet bs = new BitSet();
       bs.set(i);
index 4e3a3de..468c3c5 100644 (file)
@@ -31,10 +31,13 @@ import jalview.datamodel.SeqCigar;
 import jalview.datamodel.SequenceI;
 import jalview.math.Matrix;
 import jalview.math.MatrixI;
+import jalview.util.Constants;
 import jalview.util.MessageManager;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -87,6 +90,64 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
     return true;
   }
   
+  ArrayList<AlignmentAnnotation> ssForSeqs = null;
+
+  @Override
+  public SequenceI[] expandSeqData(SequenceI[] sequences,
+          AlignmentView seqData, SimilarityParamsI scoreParams,List<String> labels)
+  {
+    ssForSeqs = new ArrayList<AlignmentAnnotation>();
+    List<SequenceI> newSequences = new ArrayList<SequenceI>();
+    List<SeqCigar> newCigs = new ArrayList<SeqCigar>();
+    int sq = 0;
+    
+    
+
+    AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment()
+            .getAlignmentAnnotation();   
+    
+    String ssSource = scoreParams.getSecondaryStructureSource(); 
+    if(ssSource == null || ssSource == "") {
+      ssSource = MessageManager.getString("option.ss_providers_all");
+    }
+    
+    /*
+     * Add secondary structure annotations that are added to the annotation track
+     * to the map
+     */
+    Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences 
+      = AlignmentUtils.getSequenceAssociatedAlignmentAnnotations(alignAnnotList, ssSource);
+    
+    for (SeqCigar scig : seqData.getSequences())
+    {
+      SequenceI alSeq = sequences[sq++];
+      List<AlignmentAnnotation> ssec = ssAlignmentAnnotationForSequences.get(scig.getRefSeq());
+      if (ssec == null)
+      {
+        // not defined
+        newSequences.add(alSeq);
+        labels.add(alSeq.getName()+"|"+" No Secondary Structure");
+        SeqCigar newSeqCigar = scig; //new SeqCigar(scig);
+        newCigs.add(newSeqCigar);
+        ssForSeqs.add(null);
+      } else {
+        for (int i = 0; i < ssec.size(); i++)
+        {
+          labels.add(alSeq.getName()+"|"+AlignmentUtils.extractSSSourceFromAnnotationDescription(ssec.get(i)));
+          //newSequences.add(seq);
+          newSequences.add(alSeq);
+          SeqCigar newSeqCigar = scig; // new SeqCigar(scig);
+          newCigs.add(newSeqCigar);
+          ssForSeqs.add(ssec.get(i));
+        }
+      }
+    }
+
+    seqData.setSequences(newCigs.toArray(new SeqCigar[0]));
+    return newSequences.toArray(new SequenceI[0]);
+
+  }
+
   /**
    * Calculates distance score [i][j] between each pair of protein sequences 
    * based on their secondary structure annotations (H, E, C). 
@@ -105,8 +166,14 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
   @Override
   public MatrixI findDistances(AlignmentView seqData,
           SimilarityParamsI params)
-  {   
-    
+  { 
+    if (ssForSeqs==null || ssForSeqs.size()!=seqData.getSequences().length)
+    {
+      // expandSeqData needs to be called to initialise ssForSeqs
+      SequenceI[] sequences = new SequenceI[seqData.getSequences().length];
+      // we throw away the new labels in this case..
+      expandSeqData(sequences, seqData, params, new ArrayList<String>());
+    }
     SeqCigar[] seqs = seqData.getSequences();
     int noseqs = seqs.length; //no of sequences
     int cpwidth = 0; 
@@ -121,17 +188,7 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
     // need to get real position for view position
     int[] viscont = seqData.getVisibleContigs();
     
-    
-    AlignmentAnnotation[] alignAnnotList = fr.getViewport().getAlignment()
-            .getAlignmentAnnotation();   
-    
-
-    /*
-     * Add secondary structure annotations that are added to the annotation track
-     * to the map
-     */
-    Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences 
-      = AlignmentUtils.getSequenceAssociatedAlignmentAnnotations(alignAnnotList, ssSource); 
+       
 
     /*
      * scan each column, compute and add to each similarity[i, j]
@@ -157,59 +214,64 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
          */
         for (int i = 0; i < (noseqs - 1); i++)
         {
-          //Iterates for each sequences
+          AlignmentAnnotation aa_i = ssForSeqs.get(i);
+          boolean undefinedSS1 = aa_i == null;
+          // check if the sequence contains gap in the current column
+          boolean gap1 = !seqsWithoutGapAtCol.contains(seqs[i]);
+          // secondary structure is fetched only if the current column is not
+          // gap for the sequence
+          char ss1 = '*';
+          if (!gap1 && !undefinedSS1)
+          {
+            // fetch the position in sequence for the column and finds the
+            // corresponding secondary structure annotation
+            // TO DO - consider based on priority and displayed
+            int seqPosition_i = seqs[i].findPosition(cpos);
+            if (aa_i != null)
+              ss1 = AlignmentUtils.findSSAnnotationForGivenSeqposition(aa_i,
+                      seqPosition_i);
+          }
+          // Iterates for each sequences
           for (int j = i + 1; j < noseqs; j++)
           {
-                         
-            //check if ss is defined
-            boolean undefinedSS1 = ssAlignmentAnnotationForSequences.get(seqs[i].getRefSeq()) == null;
-            boolean undefinedSS2 = ssAlignmentAnnotationForSequences.get(seqs[j].getRefSeq()) == null;
+
+            // check if ss is defined
+            AlignmentAnnotation aa_j = ssForSeqs.get(j);
+            boolean undefinedSS2 = aa_j == null;
 
             // Set similarity to max score if both SS are not defined
-            if (undefinedSS1 && undefinedSS2) {
-                similarities[i][j] += ssRateMatrix.getMaximumScore();
-                continue;
-            } 
-            
+            if (undefinedSS1 && undefinedSS2)
+            {
+              similarities[i][j] += ssRateMatrix.getMaximumScore();
+              continue;
+            }
+
             // Set similarity to minimum score if either one SS is not defined
-            else if(undefinedSS1 || undefinedSS2) {
-                similarities[i][j] += ssRateMatrix.getMinimumScore();
-                continue;
+            else if (undefinedSS1 || undefinedSS2)
+            {
+              similarities[i][j] += ssRateMatrix.getMinimumScore();
+              continue;
             }
-            
-            //check if the sequence contains gap in the current column
-            boolean gap1 = !seqsWithoutGapAtCol.contains(seqs[i]);
-            boolean gap2 = !seqsWithoutGapAtCol.contains(seqs[j]);            
-            
-            //Variable to store secondary structure at the current column
-            char ss1 = '*';
+
+            boolean gap2 = !seqsWithoutGapAtCol.contains(seqs[j]);
+
+            // Variable to store secondary structure at the current column
             char ss2 = '*';
-            
-            //secondary structure is fetched only if the current column is not 
-            //gap for the sequence
-            if(!gap1 && !undefinedSS1) {  
-              //fetch the position in sequence for the column and finds the
-              //corresponding secondary structure annotation
-              //TO DO - consider based on priority and displayed
-              int seqPosition = seqs[i].findPosition(cpos);
-              AlignmentAnnotation aa = ssAlignmentAnnotationForSequences.get(seqs[i].getRefSeq()).get(0);
-              if(aa!=null)
-              ss1 = 
-                  AlignmentUtils.findSSAnnotationForGivenSeqposition(aa, seqPosition);              
-            }
-            
-            if(!gap2 && !undefinedSS2) {              
+
+            if (!gap2 && !undefinedSS2)
+            {
               int seqPosition = seqs[j].findPosition(cpos);
-              AlignmentAnnotation aa = ssAlignmentAnnotationForSequences.get(seqs[j].getRefSeq()).get(0);
-              if(aa!=null)
-                ss2 = 
-                  AlignmentUtils.findSSAnnotationForGivenSeqposition(aa, seqPosition);               
-            }           
+
+              if (aa_j != null)
+                ss2 = AlignmentUtils.findSSAnnotationForGivenSeqposition(
+                        aa_j, seqPosition);
+            }
 
             if ((!gap1 && !gap2) || params.includeGaps())
             {
               // Calculate similarity score based on the substitution matrix
-              double similarityScore = ssRateMatrix.getPairwiseScore(ss1, ss2);
+              double similarityScore = ssRateMatrix.getPairwiseScore(ss1,
+                      ss2);
               similarities[i][j] += similarityScore;
             }
           }
index a243c0c..3613afd 100644 (file)
  */
 package jalview.api.analysis;
 
+import java.util.List;
+
 import jalview.api.AlignmentViewPanel;
 import jalview.datamodel.AlignmentView;
+import jalview.datamodel.SequenceI;
 import jalview.math.MatrixI;
 
 public interface ScoreModelI
@@ -112,4 +115,15 @@ public interface ScoreModelI
    * @return
    */
   ScoreModelI getInstance(AlignmentViewPanel avp);
+
+  /**
+   * Hook for score models that generate more than one leaf per sequence.
+   * This default implementation hands back sequences untouched.
+   * 
+   * @param sequences
+   *          sequences to be filtered/expanded into the set of leaves
+   * @param seqData
+   *          origin of the sequences
+   * @param scoreParams
+   *          parameters for the score model (not used by this default)
+   * @param labels
+   *          strings to show instead of SequenceI.getName() for each element
+   *          of sequences attached to leaves (not modified by this default)
+   * @return filtered/expanded set of leaves to be analysed
+   */
+  default SequenceI[] expandSeqData(SequenceI[] sequences,
+          AlignmentView seqData, SimilarityParamsI scoreParams,
+          List<String> labels)
+  {
+    return sequences;
+  }
 }