JAL-4386 - Implementation of substitution matrix and test cases.

[jalview.git] / src / jalview / analysis / scoremodels / SecondaryStructureDistanceModel.java
diff --git a/src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java b/src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java

index 1dcf297..5b0f242 100644 (file)
--- a/src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java
+++ b/src/jalview/analysis/scoremodels/SecondaryStructureDistanceModel.java
@@ -20,6 +20,7 @@
   */
  package jalview.analysis.scoremodels;
  
+import jalview.analysis.AlignmentUtils;
  import jalview.api.AlignmentViewPanel;
  import jalview.api.FeatureRenderer;
  import jalview.api.analysis.ScoreModelI;
@@ -30,6 +31,7 @@ import jalview.datamodel.Annotation;
  import jalview.datamodel.SeqCigar;
  import jalview.math.Matrix;
  import jalview.math.MatrixI;
+import jalview.util.Constants;
  import jalview.util.SetUtils;
  
  import java.util.HashMap;
@@ -38,51 +40,27 @@ import java.util.Map;
  import java.util.Set;
  
  /* This class contains methods to calculate distance score between 
- * secondary structure annotations of the sequences. The inverse of
- * the score is later calculated for similarity score.
+ * secondary structure annotations of the sequences. 
   */
  public class SecondaryStructureDistanceModel extends DistanceScoreModel
  {
    private static final String NAME = "Secondary Structure Similarity";
-  
-  //label in secondary structure annotation data model from 3d structures
-  private static final String SS_ANNOTATION_LABEL = "Secondary Structure";
-  
-  //label in secondary structure annotation data model from JPred
-  private static final String SS_ANNOTATION_FROM_JPRED_LABEL = "jnetpred";
  
-  //label in secondary structure annotation data model from JPred
-  private static final String JPRED_WEBSERVICE = "JPred";
-  
-  private String description;
-  
-  //maximum distance score is defined as 2 as the possible number of unique secondary structure is 2. 
-  private static final int MAX_SCORE = 2;
+  private ScoreMatrix ssRateMatrix;
    
-  //minimum distance score is defined as 2 as the possible number of unique secondary structure is 2. 
-  private static final int MIN_SCORE = 0;
-   
-  //character used to represent coil in secondary structure
-  private static final char COIL = 'C';
+  private String description;    
    
    FeatureRenderer fr;
    
-  /*
-   * Annotation label for available sources of secondary structure
-   */
-  private static final String[] SS_ANNOTATION_LABELS = {
-      SS_ANNOTATION_LABEL, 
-      SS_ANNOTATION_FROM_JPRED_LABEL 
-  };
-
+  
    /**
     * Constructor
     */
    public SecondaryStructureDistanceModel()
    {
-
+    
    }
-
+  
    @Override
    public ScoreModelI getInstance(AlignmentViewPanel view)
    {
@@ -109,28 +87,11 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
      fr = view.cloneFeatureRenderer();
      return true;
    }
-
-  /**
-   * Calculates a distance measure [i][j] between each pair of sequences as the
-   * average number of features they have but do not share. That is, find the
-   * features each sequence pair has at each column, ignore feature types they
-   * have in common, and count the rest. The totals are normalised by the number
-   * of columns processed.
-   * <p>
-   * The parameters argument provides settings for treatment of gap-residue
-   * aligned positions, and whether the score is over the longer or shorter of
-   * each pair of sequences
-   * 
-   * @param seqData
-   * @param params
-   */
    
    /**
     * Calculates distance score [i][j] between each pair of protein sequences 
-   * based on their secondary structure annotations (H, E, C). That is, find the 
-   * secondary structures each sequence has at each column and scores positively for
-   * each non similar secondary structure annotations. Scores 0 for similar secondary 
-   * structure annotations. The final score is normalized by the number of 
+   * based on their secondary structure annotations (H, E, C). 
+   * The final score is normalised by the number of 
     * alignment columns processed, providing an average similarity score.
     * <p>
     * The parameters argument can include settings for handling gap-residue aligned 
@@ -139,7 +100,7 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
     * sequences of significantly different lengths.
     * 
     * @param seqData  The aligned sequence data including secondary structure annotations.
-   * @param params   Additional parameters for customizing the scoring process, such as gap 
+   * @param params   Additional parameters for customising the scoring process, such as gap 
     *                 handling and sequence length consideration.
     */
    @Override
@@ -150,16 +111,18 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
      SeqCigar[] seqs = seqData.getSequences();
      int noseqs = seqs.length; //no of sequences
      int cpwidth = 0; // = seqData.getWidth();
-    double[][] distances = new double[noseqs][noseqs]; //matrix to store distance score
-    double[][] substitutionMatrix = getSubstitutionMatrix();
+    double[][] similarities = new double[noseqs][noseqs]; //matrix to store similarity score
      //secondary structure source parameter selected by the user from the drop down.
      String ssSource = params.getSecondaryStructureSource(); 
+    ssRateMatrix = ScoreModels.getInstance().getSecondaryStructureMatrix();
      
      //defining the default value for secondary structure source as 3d structures 
      //or JPred if user selected JPred
-    String selectedSSSource = SS_ANNOTATION_LABEL;
-    if(ssSource.equals(JPRED_WEBSERVICE))
-      selectedSSSource = SS_ANNOTATION_FROM_JPRED_LABEL;
+    String selectedSSSource = Constants.SS_ANNOTATION_LABEL;
+    if(ssSource.equals(Constants.SECONDARY_STRUCTURE_LABELS.get(Constants.SS_ANNOTATION_FROM_JPRED_LABEL)))
+    {
+      selectedSSSource = Constants.SS_ANNOTATION_FROM_JPRED_LABEL;
+    }
          
      // need to get real position for view position
      int[] viscont = seqData.getVisibleContigs();
@@ -196,7 +159,7 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
          = findSeqsWithUndefinedSS(seqs, ssAlignmentAnnotationForSequences);
  
      /*
-     * scan each column, compute and add to each distance[i, j]
+     * scan each column, compute and add to each similarity[i, j]
       * the number of secondary structure annotation that seqi 
       * and seqj do not share
       */
@@ -205,7 +168,7 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
        //Iterates for each column position
        for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++) 
        {
-        cpwidth++; //used to normalise the distance score 
+        cpwidth++; //used to normalise the similarity score 
  
          /*
           * get set of sequences without gap in the current column
@@ -213,8 +176,8 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
          Set<SeqCigar> seqsWithoutGapAtCol = findSeqsWithoutGapAtColumn(seqs, cpos);
  
          /*
-         * count score for each dissimilar secondary structure annotation on i'th and j'th
-         * sequence. Igonre if similar and add this 'distance' measure to the total 
+         * calculate similarity score for each secondary structure annotation on i'th and j'th
+         * sequence and add this measure to the similarities matrix 
           * for [i, j] for j > i
           */
          for (int i = 0; i < (noseqs - 1); i++)
@@ -230,15 +193,15 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
              boolean undefinedSS1 = seqsWithUndefinedSS.contains(sc1);
              boolean undefinedSS2 = seqsWithUndefinedSS.contains(sc2);
  
-            // Set distance to 0 if both SS are not defined
+            // Set similarity to max score if both SS are not defined
              if (undefinedSS1 && undefinedSS2) {
-                distances[i][j] += MIN_SCORE; 
+                similarities[i][j] += ssRateMatrix.getMaximumScore();
                  continue;
              } 
              
-            // Set distance to maximum score if either one SS is not defined
+            // Set similarity to minimum score if either one SS is not defined
              else if(undefinedSS1 || undefinedSS2) {
-                distances[i][j] += MAX_SCORE;
+                similarities[i][j] += ssRateMatrix.getMinimumScore();
                  continue;
              }
              
@@ -247,31 +210,33 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
              boolean gap2 = !seqsWithoutGapAtCol.contains(sc2);            
              
              //Variable to store secondary structure at the current column
-            char ss1 = 'G', ss2 = 'G';
+            char ss1 = '*';
+            char ss2 = '*';
              
              //secondary structure is fetched only if the current column is not 
              //gap for the sequence
-            if(!gap1 && !undefinedSS1) {              
+            if(!gap1 && !undefinedSS1) {  
+              //fetch the position in sequence for the column and finds the
+              //corresponding secondary structure annotation
+              //TO DO - consider based on priority
+              int seqPosition = seqs[i].findPosition(cpos);
+              AlignmentAnnotation[] aa = seqs[i].getRefSeq().getAnnotation(selectedSSSource);
                ss1 = 
-                  findSSAnnotationForGivenSeqAndCol(seqs[i], cpos);              
+                  AlignmentUtils.findSSAnnotationForGivenSeqposition(aa, seqPosition);              
              }
              
              if(!gap2 && !undefinedSS2) {              
-              ss2 =
-                  findSSAnnotationForGivenSeqAndCol(seqs[j], cpos);              
+              int seqPosition = seqs[j].findPosition(cpos);
+              AlignmentAnnotation[] aa = seqs[j].getRefSeq().getAnnotation(selectedSSSource);
+              ss2 = 
+                  AlignmentUtils.findSSAnnotationForGivenSeqposition(aa, seqPosition);               
              }           
  
-            /*
-             * gap-gap scores zero
-             * similar ss-ss scores zero
-             * different ss-ss scores 1
-             * gap-ss scores 1 if params say to do so
-             */
              if ((!gap1 && !gap2) || params.includeGaps())
              {
-              // Calculate distance score based on the substitution matrix
-              double seqDistance = substitutionMatrix[getSubstitutionMatrixIndex(ss1)][getSubstitutionMatrixIndex(ss2)];
-              distances[i][j] += seqDistance;
+              // Calculate similarity score based on the substitution matrix
+              double similarityScore = ssRateMatrix.getPairwiseScore(ss1, ss2);
+              similarities[i][j] += similarityScore;
              }
            }
          }
@@ -279,20 +244,22 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
      }
  
      /*
-     * normalise the distance scores (summed over columns) by the
+     * normalise the similarity scores (summed over columns) by the
       * number of visible columns used in the calculation
       * and fill in the bottom half of the matrix
       */
      // TODO JAL-2424 cpwidth may be out by 1 - affects scores but not tree shape
+    
      for (int i = 0; i < noseqs; i++)
      {
        for (int j = i + 1; j < noseqs; j++)
-      {
-        distances[i][j] /= cpwidth;
-        distances[j][i] = distances[i][j];
+      {        
+        similarities[i][j] /= cpwidth;
+        similarities[j][i] = similarities[i][j];
        }
      }
-    return new Matrix(distances);
+    return ssRateMatrix.similarityToDistance(new Matrix(similarities));
+    
    }
  
    /**
@@ -304,7 +271,7 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
     *          (0..)
     * @return
     */
-  protected Set<SeqCigar> findSeqsWithoutGapAtColumn(
+  private Set<SeqCigar> findSeqsWithoutGapAtColumn(
            SeqCigar[] seqs, int columnPosition)
    {
      Set<SeqCigar> seqsWithoutGapAtCol = new HashSet<>();
@@ -335,7 +302,7 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
     * @param ssAlignmentAnnotationForSequences         
     * @return
     */
-  protected Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs,
+  private Set<SeqCigar> findSeqsWithUndefinedSS(SeqCigar[] seqs,
            Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
        Set<SeqCigar> seqsWithUndefinedSS = new HashSet<>();
        for (SeqCigar seq : seqs) {
@@ -361,103 +328,27 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
     */
    private boolean isSSUndefinedOrNotAdded(SeqCigar seq, 
            Map<String, HashSet<String>> ssAlignmentAnnotationForSequences) {
-      for (String label : SS_ANNOTATION_LABELS) {
+      for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet()) {
            AlignmentAnnotation[] annotations = seq.getRefSeq().getAnnotation(label);
            if (annotations != null) {
                for (AlignmentAnnotation annotation : annotations) {                
                  HashSet<String> descriptionSet = ssAlignmentAnnotationForSequences
                          .get(annotation.sequenceRef.getName());
                  if (descriptionSet != null)
+                {
                    if (descriptionSet.contains(annotation.description)) {
                        // Secondary structure annotation is present and 
                        //added to the track, no need to add seq
                        return false;
                    }
+                }
                }
            }
        }
        // Either annotations are undefined or not added to the track
        return true;
    }
-  
-  
-  /**
-   * Finds secondary structure annotation for a given sequence (SeqCigar) 
-   * and column position corresponding to the sequence.
-   * 
-   * @param seq
-   * @param columnPosition
-   *          (0..)
-   * @return
-   */
-  private char findSSAnnotationForGivenSeqAndCol(
-      SeqCigar seq, int columnPosition) 
-  {      
-    char ss = 'G'; 
-    
-    //fetch the position in sequence for the column and finds the
-    //corresponding secondary structure annotation
-    //TO DO - consider based on priority
-    int seqPosition = seq.findPosition(columnPosition);
-    AlignmentAnnotation[] aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_LABEL);
-    
-    if(aa == null) {
-      aa = seq.getRefSeq().getAnnotation(SS_ANNOTATION_FROM_JPRED_LABEL);
-    }
-    
-    if (aa != null) {
-      if (aa[0].getAnnotationForPosition(seqPosition) != null) {
-        Annotation a = aa[0].getAnnotationForPosition(seqPosition);
-        ss = a.secondaryStructure;
-        
-        //There is no representation for coil and it can be either ' ' or null. 
-        if (ss == ' ' || ss == '-') {
-          ss = COIL; 
-        }
-      }
-      else {
-        ss = COIL;
-      }
-                 
-    }
-    
-    return ss;
-  }
-  
-  /**
-   * Retrieve the substitution matrix.
-   *
-   * @return The substitution matrix.
-   */
-  private double[][] getSubstitutionMatrix() {
-      // Defining the substitution matrix 
-      // This matrix map distance scores between secondary structure symbols
      
-      return new double[][]{
-              // C   E   H  G
-              {0.0, 1.0, 1.0, 1.0}, // C - COIL
-              {1.0, 0.0, 1.0, 1.0}, // E - SHEET
-              {1.0, 1.0, 0.0, 1.0}, // H - HELIX
-              {1.0, 1.0, 1.0, 0.0} // G - GAP
-              
-      };
-  }
-  
-  private int getSubstitutionMatrixIndex(char ss) {
-    switch (ss) {
-        case 'C':
-            return 0;
-        case 'E':
-            return 1;
-        case 'H':
-            return 2;
-        case 'G':
-          return 3;
-        default:
-            throw new IllegalArgumentException("Invalid secondary structure character: " + ss);
-    }
-  }
-
    @Override
    public String getName()
    {
@@ -491,7 +382,7 @@ public class SecondaryStructureDistanceModel extends DistanceScoreModel
    @Override
    public String toString()
    {
-    return "Score between sequences based on hamming distance between binary "
+    return "Score between sequences based on similarity between binary "
              + "vectors marking secondary structure displayed at each column";
    }
  }
\ No newline at end of file