From 613a7a5d1fe2758f0bb620e49eeb87fa45b5458f Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 27 Mar 2017 10:16:21 +0100 Subject: [PATCH] JAL-2416 use '-' not space for gap in score matrices --- resources/scoreModel/blosum62.scm | 10 +--- resources/scoreModel/dna.scm | 31 +++++----- resources/scoreModel/pam250.scm | 59 +++++++++----------- src/jalview/analysis/scoremodels/PIDModel.java | 2 +- src/jalview/analysis/scoremodels/ScoreMatrix.java | 11 ++-- .../analysis/scoremodels/ScoreMatrixTest.java | 31 ++++------ 6 files changed, 61 insertions(+), 83 deletions(-) diff --git a/resources/scoreModel/blosum62.scm b/resources/scoreModel/blosum62.scm index 2ea333c..b793e04 100644 --- a/resources/scoreModel/blosum62.scm +++ b/resources/scoreModel/blosum62.scm @@ -1,18 +1,14 @@ ScoreMatrix BLOSUM62 -ARNDCQEGHILKMFPSTWYVBZX * +ARNDCQEGHILKMFPSTWYVBZX-* # # The BLOSUM62 substitution matrix, as at https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt # The first line declares a ScoreMatrix with the name BLOSUM62 (shown in menus) -# The second line gives the symbols for which scores are held in the matrix -# These may include a space (but not as the first or last character) # # Scores are not symbol case sensitive, unless column(s) are provided for lower case characters # The 'guide symbol' at the start of each row of score values is optional -# -# Header line with symbols may be provided as a guide # Values may be integer or floating point, delimited by tab, space, comma or combinations # - A R N D C Q E G H I L K M F P S T W Y V B Z X * + A R N D C Q E G H I L K M F P S T W Y V B Z X - * # A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 -4 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 -4 @@ -37,7 +33,7 @@ V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 -4 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 -4 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 -4 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 -4 - -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 1 +- -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 1 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 1 # # A R N D C Q E G H I L K M F P S T W Y V B Z X * diff --git a/resources/scoreModel/dna.scm b/resources/scoreModel/dna.scm index d32647b..e66864a 100644 --- a/resources/scoreModel/dna.scm +++ b/resources/scoreModel/dna.scm @@ -1,5 +1,5 @@ ScoreMatrix DNA -ACGTUIXRYN - +ACGTUIXRYN- # # A DNA substitution matrix. # This is an ad-hoc matrix which, in addition to penalising mutations between the common @@ -10,26 +10,21 @@ ACGTUIXRYN - # any of (ACGTU), and unfavourably match each other. # # The first line declares a ScoreMatrix with the name DNA (shown in menus) -# The second line gives the symbols for which scores are held in the matrix -# These may include a space (but not as the first or last character) # Scores are not case sensitive, unless column(s) are provided for lower case characters # -# -# Header line with symbols is provided as a guide # Values may be integer or floating point, delimited by tab, space, comma or combinations # - A C G T U I X R Y N - - 10 -8 -8 -8 -8 1 1 1 -8 1 1 1 - -8 10 -8 -8 -8 1 1 -8 1 1 1 1 - -8 -8 10 -8 -8 1 1 1 -8 1 1 1 - -8 -8 -8 10 10 1 1 -8 1 1 1 1 - -8 -8 -8 10 10 1 1 -8 1 1 1 1 - 1 1 1 1 1 10 0 0 0 1 1 1 - 1 1 1 1 1 0 10 0 0 1 1 1 - 1 -8 1 -8 -8 0 0 10 -8 1 1 1 - -8 1 -8 1 1 0 0 -8 10 1 1 1 - 1 1 1 1 1 1 1 1 1 10 1 1 - 1 1 1 1 1 1 1 1 1 1 1 1 - 1 1 1 1 1 1 1 1 1 1 1 1 + A C G T U I X R Y N - +A 10 -8 -8 -8 -8 1 1 1 -8 1 1 +C -8 10 -8 -8 -8 1 1 -8 1 1 1 +G -8 -8 10 -8 -8 1 1 1 -8 1 1 +T -8 -8 -8 10 10 1 1 -8 1 1 1 +U -8 -8 -8 10 10 1 1 -8 1 1 1 +I 1 1 1 1 1 10 0 0 0 1 1 +X 1 1 1 1 1 0 10 0 0 1 1 +R 1 -8 1 -8 -8 0 0 10 -8 1 1 +Y -8 1 -8 1 1 0 0 -8 10 1 1 +N 1 1 1 1 1 1 1 1 1 10 1 +- 1 1 1 1 1 1 1 1 1 1 1 # # A C G T U I X R Y N - diff --git a/resources/scoreModel/pam250.scm b/resources/scoreModel/pam250.scm index 02903ad..05404d0 100644 --- a/resources/scoreModel/pam250.scm +++ b/resources/scoreModel/pam250.scm @@ -1,41 +1,36 @@ ScoreMatrix PAM250 -ARNDCQEGHILKMFPSTWYVBZX * +ARNDCQEGHILKMFPSTWYVBZX-* # # The PAM250 substitution matrix # The first line declares a ScoreMatrix with the name PAM250 (shown in menus) -# The second line gives the symbols for which scores are held in the matrix -# These may include a space (but not as the first or last character) # Scores are not case sensitive, unless column(s) are provided for lower case characters -# -# -# Header line with symbols is provided as a guide # Values may be integer or floating point, delimited by tab, space, comma or combinations # -# A R N D C Q E G H I L K M F P S T W Y V B Z X * - 2 -2 0 0 -2 0 0 1 -1 -1 -2 -1 -1 -3 1 1 1 -6 -3 0 0 0 0 -8 -8 - -2 6 0 -1 -4 1 -1 -3 2 -2 -3 3 0 -4 0 0 -1 2 -4 -2 -1 0 -1 -8 -8 - 0 0 2 2 -4 1 1 0 2 -2 -3 1 -2 -3 0 1 0 -4 -2 -2 2 1 0 -8 -8 - 0 -1 2 4 -5 2 3 1 1 -2 -4 0 -3 -6 -1 0 0 -7 -4 -2 3 3 -1 -8 -8 - -2 -4 -4 -5 12 -5 -5 -3 -3 -2 -6 -5 -5 -4 -3 0 -2 -8 0 -2 -4 -5 -3 -8 -8 - 0 1 1 2 -5 4 2 -1 3 -2 -2 1 -1 -5 0 -1 -1 -5 -4 -2 1 3 -1 -8 -8 - 0 -1 1 3 -5 2 4 0 1 -2 -3 0 -2 -5 -1 0 0 -7 -4 -2 3 3 -1 -8 -8 - 1 -3 0 1 -3 -1 0 5 -2 -3 -4 -2 -3 -5 0 1 0 -7 -5 -1 0 0 -1 -8 -8 - -1 2 2 1 -3 3 1 -2 6 -2 -2 0 -2 -2 0 -1 -1 -3 0 -2 1 2 -1 -8 -8 - -1 -2 -2 -2 -2 -2 -2 -3 -2 5 2 -2 2 1 -2 -1 0 -5 -1 4 -2 -2 -1 -8 -8 - -2 -3 -3 -4 -6 -2 -3 -4 -2 2 6 -3 4 2 -3 -3 -2 -2 -1 2 -3 -3 -1 -8 -8 - -1 3 1 0 -5 1 0 -2 0 -2 -3 5 0 -5 -1 0 0 -3 -4 -2 1 0 -1 -8 -8 - -1 0 -2 -3 -5 -1 -2 -3 -2 2 4 0 6 0 -2 -2 -1 -4 -2 2 -2 -2 -1 -8 -8 - -3 -4 -3 -6 -4 -5 -5 -5 -2 1 2 -5 0 9 -5 -3 -3 0 7 -1 -4 -5 -2 -8 -8 - 1 0 0 -1 -3 0 -1 0 0 -2 -3 -1 -2 -5 6 1 0 -6 -5 -1 -1 0 -1 -8 -8 - 1 0 1 0 0 -1 0 1 -1 -1 -3 0 -2 -3 1 2 1 -2 -3 -1 0 0 0 -8 -8 - 1 -1 0 0 -2 -1 0 0 -1 0 -2 0 -1 -3 0 1 3 -5 -3 0 0 -1 0 -8 -8 - -6 2 -4 -7 -8 -5 -7 -7 -3 -5 -2 -3 -4 0 -6 -2 -5 17 0 -6 -5 -6 -4 -8 -8 - -3 -4 -2 -4 0 -4 -4 -5 0 -1 -1 -4 -2 7 -5 -3 -3 0 10 -2 -3 -4 -2 -8 -8 - 0 -2 -2 -2 -2 -2 -2 -1 -2 4 2 -2 2 -1 -1 -1 0 -6 -2 4 -2 -2 -1 -8 -8 - 0 -1 2 3 -4 1 3 0 1 -2 -3 1 -2 -4 -1 0 0 -5 -3 -2 3 2 -1 -8 -8 - 0 0 1 3 -5 3 3 0 2 -2 -3 0 -2 -5 0 0 -1 -6 -4 -2 2 3 -1 -8 -8 - 0 -1 0 -1 -3 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 0 0 -4 -2 -1 -1 -1 -1 -8 -8 - -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 1 - -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 1 +# A R N D C Q E G H I L K M F P S T W Y V B Z X - * +A 2 -2 0 0 -2 0 0 1 -1 -1 -2 -1 -1 -3 1 1 1 -6 -3 0 0 0 0 -8 -8 +R -2 6 0 -1 -4 1 -1 -3 2 -2 -3 3 0 -4 0 0 -1 2 -4 -2 -1 0 -1 -8 -8 +N 0 0 2 2 -4 1 1 0 2 -2 -3 1 -2 -3 0 1 0 -4 -2 -2 2 1 0 -8 -8 +D 0 -1 2 4 -5 2 3 1 1 -2 -4 0 -3 -6 -1 0 0 -7 -4 -2 3 3 -1 -8 -8 +C -2 -4 -4 -5 12 -5 -5 -3 -3 -2 -6 -5 -5 -4 -3 0 -2 -8 0 -2 -4 -5 -3 -8 -8 +Q 0 1 1 2 -5 4 2 -1 3 -2 -2 1 -1 -5 0 -1 -1 -5 -4 -2 1 3 -1 -8 -8 +E 0 -1 1 3 -5 2 4 0 1 -2 -3 0 -2 -5 -1 0 0 -7 -4 -2 3 3 -1 -8 -8 +G 1 -3 0 1 -3 -1 0 5 -2 -3 -4 -2 -3 -5 0 1 0 -7 -5 -1 0 0 -1 -8 -8 +H -1 2 2 1 -3 3 1 -2 6 -2 -2 0 -2 -2 0 -1 -1 -3 0 -2 1 2 -1 -8 -8 +I -1 -2 -2 -2 -2 -2 -2 -3 -2 5 2 -2 2 1 -2 -1 0 -5 -1 4 -2 -2 -1 -8 -8 +L -2 -3 -3 -4 -6 -2 -3 -4 -2 2 6 -3 4 2 -3 -3 -2 -2 -1 2 -3 -3 -1 -8 -8 +K -1 3 1 0 -5 1 0 -2 0 -2 -3 5 0 -5 -1 0 0 -3 -4 -2 1 0 -1 -8 -8 +M -1 0 -2 -3 -5 -1 -2 -3 -2 2 4 0 6 0 -2 -2 -1 -4 -2 2 -2 -2 -1 -8 -8 +F -3 -4 -3 -6 -4 -5 -5 -5 -2 1 2 -5 0 9 -5 -3 -3 0 7 -1 -4 -5 -2 -8 -8 +P 1 0 0 -1 -3 0 -1 0 0 -2 -3 -1 -2 -5 6 1 0 -6 -5 -1 -1 0 -1 -8 -8 +S 1 0 1 0 0 -1 0 1 -1 -1 -3 0 -2 -3 1 2 1 -2 -3 -1 0 0 0 -8 -8 +T 1 -1 0 0 -2 -1 0 0 -1 0 -2 0 -1 -3 0 1 3 -5 -3 0 0 -1 0 -8 -8 +W -6 2 -4 -7 -8 -5 -7 -7 -3 -5 -2 -3 -4 0 -6 -2 -5 17 0 -6 -5 -6 -4 -8 -8 +Y -3 -4 -2 -4 0 -4 -4 -5 0 -1 -1 -4 -2 7 -5 -3 -3 0 10 -2 -3 -4 -2 -8 -8 +V 0 -2 -2 -2 -2 -2 -2 -1 -2 4 2 -2 2 -1 -1 -1 0 -6 -2 4 -2 -2 -1 -8 -8 +B 0 -1 2 3 -4 1 3 0 1 -2 -3 1 -2 -4 -1 0 0 -5 -3 -2 3 2 -1 -8 -8 +Z 0 0 1 3 -5 3 3 0 2 -2 -3 0 -2 -5 0 0 -1 -6 -4 -2 2 3 -1 -8 -8 +X 0 -1 0 -1 -3 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 0 0 -4 -2 -1 -1 -1 -1 -8 -8 +- -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 1 +* -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 1 # # A R N D C Q E G H I L K M F P S T W Y V B Z X * diff --git a/src/jalview/analysis/scoremodels/PIDModel.java b/src/jalview/analysis/scoremodels/PIDModel.java index 9500df4..a3358aa 100644 --- a/src/jalview/analysis/scoremodels/PIDModel.java +++ b/src/jalview/analysis/scoremodels/PIDModel.java @@ -65,7 +65,7 @@ public class PIDModel implements SimilarityScoreModelI, public MatrixI findSimilarities(AlignmentView seqData, SimilarityParamsI options) { - String[] seqs = seqData.getSequenceStrings(' '); + String[] seqs = seqData.getSequenceStrings(Comparison.GAP_DASH); return findSimilarities(seqs, options); } diff --git a/src/jalview/analysis/scoremodels/ScoreMatrix.java b/src/jalview/analysis/scoremodels/ScoreMatrix.java index a4b8343..6a74dfc 100644 --- a/src/jalview/analysis/scoremodels/ScoreMatrix.java +++ b/src/jalview/analysis/scoremodels/ScoreMatrix.java @@ -33,6 +33,8 @@ import java.util.Arrays; public class ScoreMatrix implements SimilarityScoreModelI, PairwiseScoreModelI { + private static final char GAP_CHARACTER = Comparison.GAP_DASH; + /* * Jalview 2.10.1 treated gaps as X (peptide) or N (nucleotide) * for pairwise scoring; 2.10.2 uses gap score (last column) in @@ -350,7 +352,8 @@ public class ScoreMatrix implements SimilarityScoreModelI, public MatrixI findSimilarities(AlignmentView seqstrings, SimilarityParamsI options) { - char gapChar = scoreGapAsAny ? (seqstrings.isNa() ? 'N' : 'X') : ' '; + char gapChar = scoreGapAsAny ? (seqstrings.isNa() ? 'N' : 'X') + : Comparison.GAP_DASH; String[] seqs = seqstrings.getSequenceStrings(gapChar); return findSimilarities(seqs, options); } @@ -408,9 +411,9 @@ public class ScoreMatrix implements SimilarityScoreModelI, break; } } - // Change GAP_SPACE to GAP_DASH if we adopt - for gap in matrices - char c1 = i >= len1 ? Comparison.GAP_SPACE : seq1.charAt(i); - char c2 = i >= len2 ? Comparison.GAP_SPACE : seq2.charAt(i); + + char c1 = i >= len1 ? GAP_CHARACTER : seq1.charAt(i); + char c2 = i >= len2 ? GAP_CHARACTER : seq2.charAt(i); boolean gap1 = Comparison.isGap(c1); boolean gap2 = Comparison.isGap(c2); diff --git a/test/jalview/analysis/scoremodels/ScoreMatrixTest.java b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java index 5051bf6..5c699d1 100644 --- a/test/jalview/analysis/scoremodels/ScoreMatrixTest.java +++ b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java @@ -148,10 +148,10 @@ public class ScoreMatrixTest assertEquals(sm.getMatrixIndex('D'), 3); assertEquals(sm.getMatrixIndex('X'), 22); assertEquals(sm.getMatrixIndex('x'), 22); - assertEquals(sm.getMatrixIndex(' '), 23); + assertEquals(sm.getMatrixIndex('-'), 23); assertEquals(sm.getMatrixIndex('*'), 24); assertEquals(sm.getMatrixIndex('.'), -1); - assertEquals(sm.getMatrixIndex('-'), -1); + assertEquals(sm.getMatrixIndex(' '), -1); assertEquals(sm.getMatrixIndex('?'), -1); assertEquals(sm.getMatrixIndex((char) 128), -1); } @@ -167,10 +167,9 @@ public class ScoreMatrixTest public void testComputePairwiseScores() { /* - * NB score matrix assumes space for gap - Jalview converts - * space to gap before computing PCA or Tree + * NB score matrix expects '-' for gap */ - String[] seqs = new String[] { "FKL", "R D", "QIA", "GWC" }; + String[] seqs = new String[] { "FKL", "R-D", "QIA", "GWC" }; ScoreMatrix sm = ScoreModels.getInstance().getBlosum62(); MatrixI pairwise = sm.findSimilarities(seqs, SimilarityParams.Jalview); @@ -253,16 +252,11 @@ public class ScoreMatrixTest @Test(groups = "Functional") public void testcomputeSimilarity_matchLongestSequence() { - // TODO params.matchGaps() is not used for ScoreMatrix - // - includeGaps is sufficient (there is no denominator) - // ==> bespoke parameters only 3 booleans? /* - * for now, using space for gap to match callers of - * AlignmentView.getSequenceStrings() - * may change this to '-' (with corresponding change to matrices) + * ScoreMatrix expects '-' for gaps */ - String s1 = "FR K S"; - String s2 = "FS L"; + String s1 = "FR-K-S"; + String s2 = "FS--L"; ScoreMatrix blosum = ScoreModels.getInstance().getBlosum62(); /* @@ -318,16 +312,11 @@ public class ScoreMatrixTest @Test(groups = "Functional") public void testcomputeSimilarity_matchShortestSequence() { - // TODO params.matchGaps() is not used for ScoreMatrix - // - includeGaps is sufficient (there is no denominator) - // ==> bespoke parameters only 3 booleans? /* - * for now, using space for gap to match callers of - * AlignmentView.getSequenceStrings() - * may change this to '-' (with corresponding change to matrices) + * ScoreMatrix expects '-' for gaps */ - String s1 = "FR K S"; - String s2 = "FS L"; + String s1 = "FR-K-S"; + String s2 = "FS--L"; ScoreMatrix blosum = ScoreModels.getInstance().getBlosum62(); /* -- 1.7.10.2