JAL-1473 refactor score matrices and tree score calculations to interface/api and...
authorJim Procter <jprocter@dundee.ac.uk>
Tue, 15 Apr 2014 15:09:20 +0000 (16:09 +0100)
committerJim Procter <jprocter@dundee.ac.uk>
Mon, 21 Apr 2014 13:09:16 +0000 (14:09 +0100)
src/jalview/analysis/NJTree.java
src/jalview/analysis/PCA.java
src/jalview/analysis/scoremodels/PIDScoreModel.java [new file with mode: 0644]
src/jalview/analysis/scoremodels/PairwiseSeqScoreModel.java [new file with mode: 0644]
src/jalview/analysis/scoremodels/SWScoreModel.java [new file with mode: 0644]
src/jalview/api/analysis/ScoreModelI.java [new file with mode: 0644]
src/jalview/datamodel/AlignmentView.java
src/jalview/schemes/ResidueProperties.java
src/jalview/schemes/ScoreMatrix.java
test/jalview/schemes/ScoreMatrixPrinter.java

index 944354f..41d599e 100644 (file)
@@ -20,6 +20,7 @@ package jalview.analysis;
 
 import java.util.*;
 
+import jalview.api.analysis.ScoreModelI;
 import jalview.datamodel.*;
 import jalview.io.*;
 import jalview.schemes.*;
@@ -254,8 +255,7 @@ public class NJTree
 
     noseqs = i++;
 
-    distance = findDistances(this.seqData
-            .getSequenceStrings(Comparison.GapChars.charAt(0)));
+    distance = findDistances();
     // System.err.println("Made distances");// dbg
     makeLeaves();
     // System.err.println("Made leaves");// dbg
@@ -716,100 +716,25 @@ public class NJTree
   }
 
   /**
-   * DOCUMENT ME!
+   * Calculate a distance matrix given the sequence input data and score model
    * 
-   * @return DOCUMENT ME!
+   * @return distance matrix used to compute tree
    */
-  public float[][] findDistances(String[] sequenceString)
+  public float[][] findDistances()
   {
+    
     float[][] distance = new float[noseqs][noseqs];
 
-    if (pwtype.equals("PID"))
-    {
-      for (int i = 0; i < (noseqs - 1); i++)
-      {
-        for (int j = i; j < noseqs; j++)
-        {
-          if (j == i)
-          {
-            distance[i][i] = 0;
-          }
-          else
-          {
-            distance[i][j] = 100 - Comparison.PID(sequenceString[i],
-                    sequenceString[j]);
-
-            distance[j][i] = distance[i][j];
-          }
-        }
-      }
-    }
-    else
-    {
       // Pairwise substitution score (with no gap penalties)
-      ScoreMatrix pwmatrix = ResidueProperties.getScoreMatrix(pwtype);
-      if (pwmatrix == null)
+      ScoreModelI _pwmatrix = ResidueProperties.getScoreModel(pwtype);
+      if (_pwmatrix == null)
       {
-        pwmatrix = ResidueProperties.getScoreMatrix("BLOSUM62");
+        _pwmatrix = ResidueProperties.getScoreMatrix("BLOSUM62");
       }
-      int maxscore = 0;
-      int end = sequenceString[0].length();
-      for (int i = 0; i < (noseqs - 1); i++)
-      {
-        for (int j = i; j < noseqs; j++)
-        {
-          int score = 0;
-
-          for (int k = 0; k < end; k++)
-          {
-            try
-            {
-              score += pwmatrix.getPairwiseScore(
-                      sequenceString[i].charAt(k),
-                      sequenceString[j].charAt(k));
-            } catch (Exception ex)
-            {
-              System.err.println("err creating BLOSUM62 tree");
-              ex.printStackTrace();
-            }
-          }
-
-          distance[i][j] = (float) score;
-
-          if (score > maxscore)
-          {
-            maxscore = score;
-          }
-        }
-      }
-
-      for (int i = 0; i < (noseqs - 1); i++)
-      {
-        for (int j = i; j < noseqs; j++)
-        {
-          distance[i][j] = (float) maxscore - distance[i][j];
-          distance[j][i] = distance[i][j];
-        }
-      }
-
-    }
+      distance = _pwmatrix.findDistances(seqData);
     return distance;
 
-    // else
-    /*
-     * else if (pwtype.equals("SW")) { float max = -1;
-     * 
-     * for (int i = 0; i < (noseqs - 1); i++) { for (int j = i; j < noseqs; j++)
-     * { AlignSeq as = new AlignSeq(sequence[i], sequence[j], "pep");
-     * as.calcScoreMatrix(); as.traceAlignment(); as.printAlignment(System.out);
-     * distance[i][j] = (float) as.maxscore;
-     * 
-     * if (max < distance[i][j]) { max = distance[i][j]; } } }
-     * 
-     * for (int i = 0; i < (noseqs - 1); i++) { for (int j = i; j < noseqs; j++)
-     * { distance[i][j] = max - distance[i][j]; distance[j][i] = distance[i][j];
-     * } } }/
-     */
+
   }
 
   /**
index 89c6353..979968f 100755 (executable)
@@ -70,6 +70,10 @@ public class PCA implements Runnable
    */
   public PCA(String[] s, boolean nucleotides)
   {
+    this(s, nucleotides, null);
+  }
+  public PCA(String[] s, boolean nucleotides, String s_m)
+  {
 
     BinarySequence[] bs = new BinarySequence[s.length];
     int ii = 0;
@@ -83,9 +87,17 @@ public class PCA implements Runnable
 
     BinarySequence[] bs2 = new BinarySequence[s.length];
     ii = 0;
-
-    String sm = nucleotides ? "DNA" : "BLOSUM62";
-    ScoreMatrix smtrx = ResidueProperties.getScoreMatrix(sm);
+    ScoreMatrix smtrx = null;
+    String sm=s_m;
+    if (sm!=null)
+    {
+      smtrx = ResidueProperties.getScoreMatrix(sm);
+    }
+    if (smtrx==null)
+    {
+      // either we were given a non-existent score matrix or a scoremodel that isn't based on a pairwise symbol score matrix
+      smtrx = ResidueProperties.getScoreMatrix(sm=(nucleotides ? "DNA" : "BLOSUM62"));
+    }
     details.append("PCA calculation using " + sm
             + " sequence similarity matrix\n========\n\n");
     while ((ii < s.length) && (s[ii] != null))
diff --git a/src/jalview/analysis/scoremodels/PIDScoreModel.java b/src/jalview/analysis/scoremodels/PIDScoreModel.java
new file mode 100644 (file)
index 0000000..2069b50
--- /dev/null
@@ -0,0 +1,72 @@
+package jalview.analysis.scoremodels;
+
+import jalview.api.analysis.ScoreModelI;
+import jalview.datamodel.AlignmentView;
+import jalview.util.Comparison;
+
+/**
+ * Score model whose pairwise value is 100 minus Comparison.PID for the two
+ * sequence strings (PID is presumably percentage identity - TODO confirm).
+ */
+public class PIDScoreModel implements ScoreModelI
+{
+
+  /**
+   * Fill a square matrix with 100 - Comparison.PID(a, b) for every pair of
+   * distinct sequence strings taken from the view. Each value is stored at
+   * both [i][j] and [j][i]; the diagonal is zero.
+   *
+   * @param seqData
+   *          view whose sequence strings are compared
+   * @return matrix with one row and one column per sequence string
+   */
+  @Override
+  public float[][] findDistances(AlignmentView seqData)
+  {
+    final char gap = Comparison.GapChars.charAt(0);
+    final String[] seqs = seqData.getSequenceStrings(gap);
+    final int count = seqs.length;
+    final float[][] result = new float[count][count];
+
+    // only the strict upper triangle is visited: each value is mirrored, and
+    // the diagonal keeps the zero it was allocated with
+    for (int row = 0; row < count; row++)
+    {
+      for (int col = row + 1; col < count; col++)
+      {
+        final float dist = 100 - Comparison.PID(seqs[row], seqs[col]);
+        result[row][col] = dist;
+        result[col][row] = dist;
+      }
+    }
+    return result;
+  }
+
+  /**
+   * @return the fixed name "PID"
+   */
+  @Override
+  public String getName()
+  {
+    return "PID";
+  }
+
+  /**
+   * @return always true
+   */
+  @Override
+  public boolean isDNA()
+  {
+    return true;
+  }
+
+  /**
+   * @return always true
+   */
+  @Override
+  public boolean isProtein()
+  {
+    return true;
+  }
+
+}
diff --git a/src/jalview/analysis/scoremodels/PairwiseSeqScoreModel.java b/src/jalview/analysis/scoremodels/PairwiseSeqScoreModel.java
new file mode 100644 (file)
index 0000000..78c5f17
--- /dev/null
@@ -0,0 +1,108 @@
+package jalview.analysis.scoremodels;
+
+import jalview.api.analysis.ScoreModelI;
+import jalview.datamodel.AlignmentView;
+import jalview.schemes.ScoreMatrix;
+import jalview.util.Comparison;
+
+/**
+ * Base class for score models defined by a score for each pair of symbols.
+ * Subclasses supply the per-symbol score; this class sums it over sequence
+ * positions and converts the totals to distances.
+ */
+public abstract class PairwiseSeqScoreModel implements ScoreModelI
+{
+  /**
+   * Score one pair of symbols.
+   *
+   * @param c
+   *          symbol taken from the first sequence
+   * @param d
+   *          symbol taken from the second sequence
+   * @return score for c paired with d
+   */
+  abstract public int getPairwiseScore(char c, char d);
+
+  /**
+   * Sum getPairwiseScore over the positions of each pair of sequence strings,
+   * then report each pair as maxscore minus its total, where maxscore is the
+   * largest total seen (or 0 if no total is positive). Values are mirrored
+   * across the diagonal.
+   *
+   * @param seqData
+   *          view whose sequence strings are compared; the length of the first
+   *          string is used as the number of positions for every pair
+   * @return square matrix with one row and column per sequence string; an
+   *         empty matrix if the view yields no sequence strings
+   */
+  @Override
+  public float[][] findDistances(AlignmentView seqData)
+  {
+    String[] sequenceString = seqData
+            .getSequenceStrings(Comparison.GapChars.charAt(0));
+    int noseqs = sequenceString.length;
+    float[][] distance = new float[noseqs][noseqs];
+    if (noseqs == 0)
+    {
+      // nothing to compare, and sequenceString[0] below would be out of bounds
+      return distance;
+    }
+
+    int maxscore = 0;
+    int end = sequenceString[0].length();
+    // NOTE(review): the outer loop stops before the last sequence, so it is
+    // never scored against itself and distance[noseqs-1][noseqs-1] stays 0 -
+    // confirm whether that is intended
+    for (int i = 0; i < (noseqs - 1); i++)
+    {
+      for (int j = i; j < noseqs; j++)
+      {
+        int score = sumPairwiseScores(sequenceString[i], sequenceString[j],
+                end);
+        distance[i][j] = (float) score;
+        if (score > maxscore)
+        {
+          maxscore = score;
+        }
+      }
+    }
+
+    // turn the summed scores into distances relative to the best total
+    for (int i = 0; i < (noseqs - 1); i++)
+    {
+      for (int j = i; j < noseqs; j++)
+      {
+        distance[i][j] = (float) maxscore - distance[i][j];
+        distance[j][i] = distance[i][j];
+      }
+    }
+    return distance;
+  }
+
+  /**
+   * Add up getPairwiseScore for the first end positions of two strings. A
+   * position whose lookup throws is reported on stderr and contributes
+   * nothing, so one bad symbol does not abort the whole calculation.
+   */
+  private int sumPairwiseScores(String first, String second, int end)
+  {
+    int score = 0;
+    for (int k = 0; k < end; k++)
+    {
+      try
+      {
+        score += getPairwiseScore(first.charAt(k), second.charAt(k));
+      } catch (Exception ex)
+      {
+        System.err.println("err creating " + getName() + " tree");
+        ex.printStackTrace();
+      }
+    }
+    return score;
+  }
+
+  /**
+   * @return the integer matrix of pairwise scores held by the implementation
+   */
+  abstract public int[][] getMatrix();
+}
\ No newline at end of file
diff --git a/src/jalview/analysis/scoremodels/SWScoreModel.java b/src/jalview/analysis/scoremodels/SWScoreModel.java
new file mode 100644 (file)
index 0000000..d8c6230
--- /dev/null
@@ -0,0 +1,101 @@
+package jalview.analysis.scoremodels;
+
+import jalview.analysis.AlignSeq;
+import jalview.api.analysis.ScoreModelI;
+import jalview.datamodel.AlignmentView;
+import jalview.datamodel.SequenceI;
+import jalview.util.Comparison;
+
+/**
+ * Score model that derives its matrix from AlignSeq pairwise alignment scores.
+ */
+public class SWScoreModel implements ScoreModelI
+{
+
+  /**
+   * Align pairs of sequences from the view with AlignSeq and report each pair
+   * as (largest maxscore seen) - (maxscore for that pair), mirrored across the
+   * diagonal.
+   *
+   * @param seqData
+   *          view supplying the sequences; its isNa() flag selects "dna" or
+   *          "pep" for AlignSeq
+   * @return square matrix with one row and column per sequence
+   */
+  @Override
+  public float[][] findDistances(AlignmentView seqData)
+  {
+    final SequenceI[] seqs = seqData.getVisibleAlignment(
+            Comparison.GapChars.charAt(0)).getSequencesArray();
+    final int count = seqs.length;
+    final float[][] result = new float[count][count];
+
+    float best = -1;
+    for (int row = 0; row < count - 1; row++)
+    {
+      for (int col = row; col < count; col++)
+      {
+        final String type = seqData.isNa() ? "dna" : "pep";
+        final AlignSeq aligner = new AlignSeq(seqs[row], seqs[col], type);
+        aligner.calcScoreMatrix();
+        aligner.traceAlignment();
+        // NOTE(review): prints every pairwise alignment to stdout - looks like
+        // leftover debugging output; confirm before removing
+        aligner.printAlignment(System.out);
+        result[row][col] = (float) aligner.maxscore;
+        if (result[row][col] > best)
+        {
+          best = result[row][col];
+        }
+      }
+    }
+
+    // convert scores to distances relative to the best score and mirror them
+    for (int row = 0; row < count - 1; row++)
+    {
+      for (int col = row; col < count; col++)
+      {
+        final float dist = best - result[row][col];
+        result[row][col] = dist;
+        result[col][row] = dist;
+      }
+    }
+    return result;
+  }
+
+  /**
+   * @return the fixed name "Smith Waterman Score"
+   */
+  @Override
+  public String getName()
+  {
+    return "Smith Waterman Score";
+  }
+
+  /**
+   * @return always true
+   */
+  @Override
+  public boolean isDNA()
+  {
+    return true;
+  }
+
+  /**
+   * @return always true
+   */
+  @Override
+  public boolean isProtein()
+  {
+    return true;
+  }
+
+  /**
+   * @return a one-line description of this model
+   */
+  @Override
+  public String toString()
+  {
+    return "Score between two sequences aligned with Smith Waterman with default Peptide/Nucleotide matrix";
+  }
+}
diff --git a/src/jalview/api/analysis/ScoreModelI.java b/src/jalview/api/analysis/ScoreModelI.java
new file mode 100644 (file)
index 0000000..0d56033
--- /dev/null
@@ -0,0 +1,40 @@
+package jalview.api.analysis;
+
+import jalview.datamodel.AlignmentView;
+
+/**
+ * A named scheme for computing a matrix of pairwise values between the
+ * sequences of an alignment view.
+ */
+public interface ScoreModelI
+{
+
+  /**
+   * Compute the pairwise matrix for a view.
+   *
+   * @param seqData
+   *          view holding the sequences to compare
+   * @return matrix indexed by sequence in both dimensions (NOTE(review):
+   *         presumably distances, larger meaning less alike - confirm for
+   *         each implementation)
+   */
+  float[][] findDistances(AlignmentView seqData);
+
+  /**
+   * @return name identifying this model
+   */
+  String getName();
+
+  /**
+   * @return true if the model can be used with nucleotide data (assumed from
+   *         the method name - TODO confirm)
+   */
+  boolean isDNA();
+
+  /**
+   * @return true if the model can be used with protein data (assumed from the
+   *         method name - TODO confirm)
+   */
+  boolean isProtein();
+
+}
index ea0fbe0..273a685 100644 (file)
@@ -46,6 +46,16 @@ public class AlignmentView
    */
   private Vector scGroups;
 
+  /** assigned from alignment.isNucleotide() elsewhere in this class */
+  private boolean isNa = false;
+  /**
+   * @return true for a nucleotide view, false (the default) for peptides
+   */
+  public boolean isNa()
+  {
+    return this.isNa;
+  }
+
   /**
    * Group defined over SeqCigars. Unlike AlignmentI associated groups, each
    * SequenceGroup hold just the essential properties for the group, but no
@@ -99,6 +109,7 @@ public class AlignmentView
             (selectedRegionOnly ? selection : null)),
             (selectedRegionOnly && selection != null) ? selection
                     .getStartRes() : 0);
+    isNa = alignment.isNucleotide();
     // walk down SeqCigar array and Alignment Array - optionally restricted by
     // selected region.
     // test group membership for each sequence in each group, store membership
index 9acfc24..98be0c8 100755 (executable)
  */
 package jalview.schemes;
 
+import jalview.analysis.scoremodels.PIDScoreModel;
+import jalview.api.analysis.ScoreModelI;
+
 import java.util.*;
 import java.util.List;
-
 import java.awt.*;
 
 public class ResidueProperties
 {
-  public static Hashtable scoreMatrices = new Hashtable();
+  public static Hashtable<String, ScoreModelI> scoreMatrices = new Hashtable<String, ScoreModelI>(); // keyed by model name
 
   // Stores residue codes/names and colours and other things
   public static final int[] aaIndex; // aaHash version 2.1.1 and below
@@ -1416,6 +1418,10 @@ public class ResidueProperties
     propHash.put("proline", proline);
     propHash.put("polar", polar);
   }
+  static
+  {
+    scoreMatrices.put("PID", new PIDScoreModel());
+  }
 
   private ResidueProperties()
   {
@@ -1540,12 +1546,22 @@ public class ResidueProperties
   public static ScoreMatrix getScoreMatrix(String pwtype)
   {
     Object val = scoreMatrices.get(pwtype);
-    if (val != null)
+    if (val != null && val instanceof ScoreMatrix)
     {
       return (ScoreMatrix) val;
     }
     return null;
   }
+  /**
+   * Look up a registered score model by name.
+   * @param pwtype key the model was stored under in scoreMatrices
+   * @return the model stored under pwtype, or null if there is none
+   */
+  public static ScoreModelI getScoreModel(String pwtype)
+  {
+    final ScoreModelI model = scoreMatrices.get(pwtype);
+    return model;
+  }
 
   public static int getPAM250(char c, char d)
   {
index e78b92c..ab603e1 100644 (file)
  */
 package jalview.schemes;
 
-public class ScoreMatrix
+import jalview.analysis.scoremodels.PairwiseSeqScoreModel;
+import jalview.api.analysis.ScoreModelI;
+
+public class ScoreMatrix extends PairwiseSeqScoreModel implements ScoreModelI
 {
   String name;
+  
+  @Override
+  public String getName()
+  {
+    return name;
+  }
 
   /**
    * reference to integer score matrix
@@ -31,23 +40,31 @@ public class ScoreMatrix
    * 0 for Protein Score matrix. 1 for dna score matrix
    */
   int type;
-
+  /**
+   * 
+   * @param name Unique, human readable name for the matrix
+   * @param matrix Pairwise scores indexed according to appropriate symbol alphabet
+   * @param type 0 for Protein, 1 for NA
+   */
   ScoreMatrix(String name, int[][] matrix, int type)
   {
     this.matrix = matrix;
     this.type = type;
+    this.name = name;
   }
 
+  @Override
   public boolean isDNA()
   {
     return type == 1;
   }
-
+  @Override
   public boolean isProtein()
   {
     return type == 0;
   }
 
+  @Override
   public int[][] getMatrix()
   {
     return matrix;
index a472951..2830918 100644 (file)
@@ -18,6 +18,8 @@
  */
 package jalview.schemes;
 
+import jalview.api.analysis.ScoreModelI;
+
 import java.util.Map;
 
 import org.junit.Test;
@@ -28,7 +30,7 @@ public class ScoreMatrixPrinter
   @Test
   public void printAllMatrices()
   {
-    for (Map.Entry<String,ScoreMatrix> sm:((Map<String, ScoreMatrix>) ResidueProperties.scoreMatrices).entrySet())
+    for (Map.Entry<String,ScoreModelI> sm: ResidueProperties.scoreMatrices.entrySet())
     {
       System.out.println("Matrix "+sm.getKey());
       System.out.println(sm.getValue().toString());
@@ -37,10 +39,14 @@ public class ScoreMatrixPrinter
   @Test
   public void printHTMLMatrices()
   {
-    for (Map.Entry<String,ScoreMatrix> sm:((Map<String, ScoreMatrix>) ResidueProperties.scoreMatrices).entrySet())
+    for (Map.Entry<String,ScoreModelI> _sm: ResidueProperties.scoreMatrices.entrySet())
     {
-      System.out.println("Matrix "+sm.getKey());
-      System.out.println(sm.getValue().outputMatrix(true));
+      if (_sm.getValue() instanceof ScoreMatrix)
+      {
+        ScoreMatrix sm = (ScoreMatrix) _sm.getValue();
+        System.out.println("Matrix "+_sm.getKey());
+        System.out.println(sm.outputMatrix(true));
+      }
     }
   }