From 4724cafdcdcc4f3acc77c09261afe56f518a1e87 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 17 Feb 2017 13:25:42 +0000 Subject: [PATCH] JAL-2403 extract method refactoring of FeatureScoreModel + test --- .../analysis/scoremodels/FeatureScoreModel.java | 152 ++++++++++++-------- .../scoremodels/FeatureScoreModelTest.java | 66 +++++++++ 2 files changed, 160 insertions(+), 58 deletions(-) diff --git a/src/jalview/analysis/scoremodels/FeatureScoreModel.java b/src/jalview/analysis/scoremodels/FeatureScoreModel.java index 7c81912..6da55c3 100644 --- a/src/jalview/analysis/scoremodels/FeatureScoreModel.java +++ b/src/jalview/analysis/scoremodels/FeatureScoreModel.java @@ -26,9 +26,11 @@ import jalview.datamodel.AlignmentView; import jalview.datamodel.SeqCigar; import jalview.datamodel.SequenceFeature; -import java.util.ArrayList; -import java.util.Hashtable; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Set; public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI { @@ -42,88 +44,66 @@ public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI return true; } + /** + * Calculates a distance measure [i][j] between each pair of sequences as the + * average number of features they have but do not share. That is, find the + * features each sequence pair has at each column, ignore feature types they + * have in common, and count the rest. The totals are normalised by the number + * of columns processed. 
+ */ @Override public float[][] findDistances(AlignmentView seqData) { - int nofeats = 0; List dft = fr.getDisplayedFeatureTypes(); - nofeats = dft.size(); SeqCigar[] seqs = seqData.getSequences(); int noseqs = seqs.length; int cpwidth = 0;// = seqData.getWidth(); float[][] distance = new float[noseqs][noseqs]; - if (nofeats == 0) + if (dft.isEmpty()) { - for (float[] d : distance) - { - for (int i = 0; i < d.length; d[i++] = 0f) - { - ; - } - } return distance; } + // need to get real position for view position int[] viscont = seqData.getVisibleContigs(); + + /* + * scan each column, compute and add to each distance[i, j] + * the number of feature types that seqi and seqj do not share + */ for (int vc = 0; vc < viscont.length; vc += 2) { - for (int cpos = viscont[vc]; cpos <= viscont[vc + 1]; cpos++) { cpwidth++; - // get visible features at cpos under view's display settings and - // compare them - List> sfap = new ArrayList>(); - for (int i = 0; i < noseqs; i++) - { - Hashtable types = new Hashtable(); - int spos = seqs[i].findPosition(cpos); - if (spos != -1) - { - List sfs = fr.findFeaturesAtRes( - seqs[i].getRefSeq(), spos); - for (SequenceFeature sf : sfs) - { - types.put(sf.getType(), sf); - } - } - sfap.add(types); - } + + /* + * first pass: record feature types in column for each sequence + */ + Map> sfap = findFeatureTypesAtColumn( + seqs, cpos); + + /* + * count feature types on either i'th or j'th sequence but not both + * and add this 'distance' measure to the total for [i, j] for j > i + */ for (int i = 0; i < (noseqs - 1); i++) { - if (cpos == 0) - { - distance[i][i] = 0f; - } for (int j = i + 1; j < noseqs; j++) { - int sfcommon = 0; - // compare the two lists of features... 
- Hashtable fi = sfap.get(i), fk, fj = sfap - .get(j); - if (fi.size() > fj.size()) - { - fk = fj; - } - else - { - fk = fi; - fi = fj; - } - for (String k : fi.keySet()) - { - SequenceFeature sfj = fk.get(k); - if (sfj != null) - { - sfcommon++; - } - } - distance[i][j] += (fi.size() + fk.size() - 2f * sfcommon); - distance[j][i] += distance[i][j]; + int seqDistance = countUnsharedFeatureTypes(sfap.get(seqs[i]), + sfap.get(seqs[j])); + distance[i][j] += seqDistance; + // distance[j][i] += distance[i][j]; } } } } + + /* + * normalise the distance scores (summed over columns) by the + * number of visible columns used in the calculation + */ for (int i = 0; i < noseqs; i++) { for (int j = i + 1; j < noseqs; j++) @@ -135,6 +115,62 @@ public class FeatureScoreModel implements ScoreModelI, ViewBasedAnalysisI return distance; } + /** + * Returns the count of values that are in set1 or set2 but not in both + * + * @param set1 + * @param set2 + * @return + */ + protected int countUnsharedFeatureTypes(Set set1, Set set2) + { + int size1 = set1.size(); + int size2 = set2.size(); + Set smallerSet = size1 < size2 ? set1 : set2; + Set largerSet = (smallerSet == set1 ? 
set2 : set1); + int inCommon = 0; + for (String k : smallerSet) + { + if (largerSet.contains(k)) + { + inCommon++; + } + } + + int notInCommon = (size1 - inCommon) + (size2 - inCommon); + return notInCommon; + } + + /** + * Builds and returns a map (keyed by SeqCigar) of the set of visible feature types at + * the given column position + * + * @param seqs + * @param columnPosition + * @return + */ + protected Map> findFeatureTypesAtColumn( + SeqCigar[] seqs, int columnPosition) + { + Map> sfap = new HashMap>(); + for (SeqCigar seq : seqs) + { + Set types = new HashSet(); + int spos = seq.findPosition(columnPosition); + if (spos != -1) + { + List sfs = fr.findFeaturesAtRes(seq.getRefSeq(), + spos); + for (SequenceFeature sf : sfs) + { + types.add(sf.getType()); + } + } + sfap.put(seq, types); + } + return sfap; + } + @Override public String getName() { diff --git a/test/jalview/analysis/scoremodels/FeatureScoreModelTest.java b/test/jalview/analysis/scoremodels/FeatureScoreModelTest.java index 66eb5a5..14254f7 100644 --- a/test/jalview/analysis/scoremodels/FeatureScoreModelTest.java +++ b/test/jalview/analysis/scoremodels/FeatureScoreModelTest.java @@ -20,6 +20,8 @@ */ package jalview.analysis.scoremodels; +import static org.testng.Assert.assertEquals; + import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; @@ -52,6 +54,17 @@ public class FeatureScoreModelTest int[] sf3 = new int[] { -1, -1, -1, -1, -1, -1, 76, 77 }; + /** + *
+   * Load test alignment and add features to sequences: 
+   *      FER1_MESCR FER1_SPIOL FER3_RAPSA FER1_MAIZE 
+   *  sf1     X          X          X  
+   *  sf2                X                     X 
+   *  sf3                                      X
+   * 
+ * + * @return + */ public AlignFrame getTestAlignmentFrame() { AlignFrame alf = new FileLoader(false).LoadFileWaitTillLoaded( @@ -97,6 +110,7 @@ public class FeatureScoreModelTest Assert.assertTrue(fsm.configureFromAlignmentView(alf.getCurrentView() .getAlignPanel())); alf.selectAllSequenceMenuItem_actionPerformed(null); + float[][] dm = fsm.findDistances(alf.getViewport().getAlignmentView( true)); Assert.assertTrue(dm[0][2] == 0f, @@ -190,4 +204,56 @@ public class FeatureScoreModelTest .size(), 0); } + @Test(groups = { "Functional" }) + public void testFindDistances() throws Exception + { + String seqs = ">s1\nABCDE\n>seq2\nABCDE\n"; + AlignFrame alf = new FileLoader().LoadFileWaitTillLoaded(seqs, + DataSourceType.PASTE); + SequenceI s1 = alf.getViewport().getAlignment().getSequenceAt(0); + SequenceI s2 = alf.getViewport().getAlignment().getSequenceAt(1); + + /* + * set domain and variant features thus: + * ----5 + * s1 ddd.. + * s1 .vvv. + * s1 ..vvv + * s2 .ddd. + * s2 vv..v + * The number of unshared feature types per column is + * 20120 (two features of the same type doesn't affect score) + * giving an average (pairwise distance) of 5/5 or 1.0 + */ + s1.addSequenceFeature(new SequenceFeature("domain", null, 1, 3, 0f, + null)); + s1.addSequenceFeature(new SequenceFeature("variant", null, 2, 4, 0f, + null)); + s1.addSequenceFeature(new SequenceFeature("variant", null, 3, 5, 0f, + null)); + s2.addSequenceFeature(new SequenceFeature("domain", null, 2, 4, 0f, + null)); + s2.addSequenceFeature(new SequenceFeature("variant", null, 1, 2, 0f, + null)); + s2.addSequenceFeature(new SequenceFeature("variant", null, 5, 5, 0f, + null)); + alf.setShowSeqFeatures(true); + alf.getFeatureRenderer().findAllFeatures(true); + + FeatureScoreModel fsm = new FeatureScoreModel(); + Assert.assertTrue(fsm.configureFromAlignmentView(alf.getCurrentView() + .getAlignPanel())); + alf.selectAllSequenceMenuItem_actionPerformed(null); + + float[][] distances = 
fsm.findDistances(alf.getViewport() + .getAlignmentView(true)); + assertEquals(distances.length, 2); + assertEquals(distances[0][0], 0f); + assertEquals(distances[1][1], 0f); + // these left to fail pending resolution of + // JAL-2424 (dividing score by 6, not 5) + assertEquals(distances[0][1], 1f); + assertEquals(distances[1][0], 1f); + } + } -- 1.7.10.2