JAL-2403 extract method refactoring of FeatureScoreModel + test
authorgmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 17 Feb 2017 13:25:42 +0000 (13:25 +0000)
committergmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 17 Feb 2017 13:25:42 +0000 (13:25 +0000)
src/jalview/analysis/scoremodels/FeatureScoreModel.java
test/jalview/analysis/scoremodels/FeatureScoreModelTest.java

index 7c81912..6da55c3 100644 (file)
@@ -26,9 +26,11 @@ import jalview.datamodel.AlignmentView;
 import jalview.datamodel.SeqCigar;
 import jalview.datamodel.SequenceFeature;
 
-import java.util.ArrayList;
-import java.util.Hashtable;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
+import java.util.Set;
 
 public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI
 {
@@ -42,88 +44,66 @@ public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI
     return true;
   }
 
+  /**
+   * Calculates a distance measure [i][j] between each pair of sequences as the
+   * average number of features they have but do not share. That is, find the
+   * features each sequence pair has at each column, ignore feature types they
+   * have in common, and count the rest. The totals are normalised by the number
+   * of columns processed.
+   */
   @Override
   public float[][] findDistances(AlignmentView seqData)
   {
-    int nofeats = 0;
     List<String> dft = fr.getDisplayedFeatureTypes();
-    nofeats = dft.size();
     SeqCigar[] seqs = seqData.getSequences();
     int noseqs = seqs.length;
     int cpwidth = 0;// = seqData.getWidth();
     float[][] distance = new float[noseqs][noseqs];
-    if (nofeats == 0)
+    if (dft.isEmpty())
     {
-      for (float[] d : distance)
-      {
-        for (int i = 0; i < d.length; d[i++] = 0f)
-        {
-          ;
-        }
-      }
       return distance;
     }
+
     // need to get real position for view position
     int[] viscont = seqData.getVisibleContigs();
+
+    /*
+     * scan each column, compute and add to each distance[i, j]
+     * the number of feature types that seqi and seqj do not share
+     */
     for (int vc = 0; vc < viscont.length; vc += 2)
     {
-
       for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++)
       {
         cpwidth++;
-        // get visible features at cpos under view's display settings and
-        // compare them
-        List<Hashtable<String, SequenceFeature>> sfap = new ArrayList<Hashtable<String, SequenceFeature>>();
-        for (int i = 0; i < noseqs; i++)
-        {
-          Hashtable<String, SequenceFeature> types = new Hashtable<String, SequenceFeature>();
-          int spos = seqs[i].findPosition(cpos);
-          if (spos != -1)
-          {
-            List<SequenceFeature> sfs = fr.findFeaturesAtRes(
-                    seqs[i].getRefSeq(), spos);
-            for (SequenceFeature sf : sfs)
-            {
-              types.put(sf.getType(), sf);
-            }
-          }
-          sfap.add(types);
-        }
+
+        /*
+         * first pass: record features types in column for each sequence
+         */
+        Map<SeqCigar, Set<String>> sfap = findFeatureTypesAtColumn(
+                seqs, cpos);
+
+        /*
+         * count feature types on either i'th or j'th sequence but not both
+         * and add this 'distance' measure to the total for [i, j] for j > i
+         */
         for (int i = 0; i < (noseqs - 1); i++)
         {
-          if (cpos == 0)
-          {
-            distance[i][i] = 0f;
-          }
           for (int j = i + 1; j < noseqs; j++)
           {
-            int sfcommon = 0;
-            // compare the two lists of features...
-            Hashtable<String, SequenceFeature> fi = sfap.get(i), fk, fj = sfap
-                    .get(j);
-            if (fi.size() > fj.size())
-            {
-              fk = fj;
-            }
-            else
-            {
-              fk = fi;
-              fi = fj;
-            }
-            for (String k : fi.keySet())
-            {
-              SequenceFeature sfj = fk.get(k);
-              if (sfj != null)
-              {
-                sfcommon++;
-              }
-            }
-            distance[i][j] += (fi.size() + fk.size() - 2f * sfcommon);
-            distance[j][i] += distance[i][j];
+            int seqDistance = countUnsharedFeatureTypes(sfap.get(seqs[i]),
+                    sfap.get(seqs[j]));
+            distance[i][j] += seqDistance;
+            // distance[j][i] += distance[i][j];
           }
         }
       }
     }
+
+    /*
+     * normalise the distance scores (summed over columns) by the
+     * number of visible columns used in the calculation
+     */
     for (int i = 0; i < noseqs; i++)
     {
       for (int j = i + 1; j < noseqs; j++)
@@ -135,6 +115,62 @@ public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI
     return distance;
   }
 
+  /**
+   * Returns the count of values that are set1 or set2 but not in both
+   * 
+   * @param set1
+   * @param set2
+   * @return
+   */
+  protected int countUnsharedFeatureTypes(Set<String> set1, Set<String> set2)
+  {
+    int size1 = set1.size();
+    int size2 = set2.size();
+    Set<String> smallerSet = size1 < size2 ? set1 : set2;
+    Set<String> largerSet = (smallerSet == set1 ? set2 : set1);
+    int inCommon = 0;
+    for (String k : smallerSet)
+    {
+      if (largerSet.contains(k))
+      {
+        inCommon++;
+      }
+    }
+
+    int notInCommon = (size1 - inCommon) + (size2 - inCommon);
+    return notInCommon;
+  }
+
+  /**
+   * Builds and returns a list (one per SeqCigar) of visible feature types at
+   * the given column position
+   * 
+   * @param seqs
+   * @param columnPosition
+   * @return
+   */
+  protected Map<SeqCigar, Set<String>> findFeatureTypesAtColumn(
+          SeqCigar[] seqs, int columnPosition)
+  {
+    Map<SeqCigar, Set<String>> sfap = new HashMap<SeqCigar, Set<String>>();
+    for (SeqCigar seq : seqs)
+    {
+      Set<String> types = new HashSet<String>();
+      int spos = seq.findPosition(columnPosition);
+      if (spos != -1)
+      {
+        List<SequenceFeature> sfs = fr.findFeaturesAtRes(seq.getRefSeq(),
+                spos);
+        for (SequenceFeature sf : sfs)
+        {
+          types.add(sf.getType());
+        }
+      }
+      sfap.put(seq, types);
+    }
+    return sfap;
+  }
+
   @Override
   public String getName()
   {
index 66eb5a5..14254f7 100644 (file)
@@ -20,6 +20,8 @@
  */
 package jalview.analysis.scoremodels;
 
+import static org.testng.Assert.assertEquals;
+
 import jalview.datamodel.AlignmentI;
 import jalview.datamodel.SequenceFeature;
 import jalview.datamodel.SequenceI;
@@ -52,6 +54,17 @@ public class FeatureScoreModelTest
 
   int[] sf3 = new int[] { -1, -1, -1, -1, -1, -1, 76, 77 };
 
+  /**
+   * <pre>
+   * Load test alignment and add features to sequences: 
+   *      FER1_MESCR FER1_SPIOL FER3_RAPSA FER1_MAIZE 
+   *  sf1     X          X          X  
+   *  sf2                X                     X 
+   *  sf3                                      X
+   * </pre>
+   * 
+   * @return
+   */
   public AlignFrame getTestAlignmentFrame()
   {
     AlignFrame alf = new FileLoader(false).LoadFileWaitTillLoaded(
@@ -97,6 +110,7 @@ public class FeatureScoreModelTest
     Assert.assertTrue(fsm.configureFromAlignmentView(alf.getCurrentView()
             .getAlignPanel()));
     alf.selectAllSequenceMenuItem_actionPerformed(null);
+
     float[][] dm = fsm.findDistances(alf.getViewport().getAlignmentView(
             true));
     Assert.assertTrue(dm[0][2] == 0f,
@@ -190,4 +204,56 @@ public class FeatureScoreModelTest
             .size(), 0);
   }
 
+  @Test(groups = { "Functional" })
+  public void testFindDistances() throws Exception
+  {
+    String seqs = ">s1\nABCDE\n>seq2\nABCDE\n";
+    AlignFrame alf = new FileLoader().LoadFileWaitTillLoaded(seqs,
+            DataSourceType.PASTE);
+    SequenceI s1 = alf.getViewport().getAlignment().getSequenceAt(0);
+    SequenceI s2 = alf.getViewport().getAlignment().getSequenceAt(1);
+
+    /*
+     * set domain and variant features thus:
+     *     ----5
+     *  s1 ddd..
+     *  s1 .vvv.
+     *  s1 ..vvv    
+     *  s2 .ddd. 
+     *  s2 vv..v
+     *  The number of unshared feature types per column is
+     *     20120 (two features of the same type doesn't affect score)
+     *  giving an average (pairwise distance) of 5/5 or 1.0 
+     */
+    s1.addSequenceFeature(new SequenceFeature("domain", null, 1, 3, 0f,
+            null));
+    s1.addSequenceFeature(new SequenceFeature("variant", null, 2, 4, 0f,
+            null));
+    s1.addSequenceFeature(new SequenceFeature("variant", null, 3, 5, 0f,
+            null));
+    s2.addSequenceFeature(new SequenceFeature("domain", null, 2, 4, 0f,
+            null));
+    s2.addSequenceFeature(new SequenceFeature("variant", null, 1, 2, 0f,
+            null));
+    s2.addSequenceFeature(new SequenceFeature("variant", null, 5, 5, 0f,
+            null));
+    alf.setShowSeqFeatures(true);
+    alf.getFeatureRenderer().findAllFeatures(true);
+
+    FeatureScoreModel fsm = new FeatureScoreModel();
+    Assert.assertTrue(fsm.configureFromAlignmentView(alf.getCurrentView()
+            .getAlignPanel()));
+    alf.selectAllSequenceMenuItem_actionPerformed(null);
+
+    float[][] distances = fsm.findDistances(alf.getViewport()
+            .getAlignmentView(true));
+    assertEquals(distances.length, 2);
+    assertEquals(distances[0][0], 0f);
+    assertEquals(distances[1][1], 0f);
+    // these left to fail pending resolution of
+    // JAL-2424 (dividing score by 6, not 5)
+    assertEquals(distances[0][1], 1f);
+    assertEquals(distances[1][0], 1f);
+  }
+
 }