From 6fc54333dcca130b378144b067a9a016c9e1cc41 Mon Sep 17 00:00:00 2001 From: Ben Soares Date: Sun, 13 Nov 2022 00:53:27 +0000 Subject: [PATCH] JAL-4019 improvement to short sequence nucleotide detection, and some tidying of unused parameters --- src/jalview/schemes/ResidueProperties.java | 15 +-- src/jalview/util/Comparison.java | 158 +++++++++++++++++++++++----- 2 files changed, 139 insertions(+), 34 deletions(-) diff --git a/src/jalview/schemes/ResidueProperties.java b/src/jalview/schemes/ResidueProperties.java index 73b48a9..2ce3e6d 100755 --- a/src/jalview/schemes/ResidueProperties.java +++ b/src/jalview/schemes/ResidueProperties.java @@ -48,6 +48,8 @@ public class ResidueProperties public static final Map nucleotideName = new HashMap<>(); + public static final Map nucleotideAmbiguityName = new HashMap<>(); + // lookup from modified amino acid (e.g. MSE) to canonical form (e.g. MET) public static final Map modifications = new HashMap<>(); @@ -123,16 +125,14 @@ public class ResidueProperties static { - - String[][] namesArray = { { "a", "Adenine" }, { "g", "Guanine" }, - { "c", "Cytosine" }, + String[][] namesArray = { { "a", "Adenine" }, { "c", "Cytosine" }, + { "g", "Guanine" }, { "t", "Thymine" }, { "u", "Uracil" }, { "i", "Inosine" }, { "x", "Xanthine" }, { "r", "Unknown Purine" }, { "y", "Unknown Pyrimidine" }, - { "n", "Unknown" }, { "w", "Weak nucleotide (A or T)" }, { "s", "Strong nucleotide (G or C)" }, { "m", "Amino (A or C)" }, @@ -140,10 +140,10 @@ public class ResidueProperties { "b", "Not A (G or C or T)" }, { "h", "Not G (A or C or T)" }, { "d", "Not C (A or G or T)" }, - { "v", "Not T (A or G or C" } }; - + { "v", "Not T (A or G or C" }, + { "n", "Unknown" } }; // "gap" index - maxNucleotideIndex = namesArray.length + 1; + maxNucleotideIndex = namesArray.length; nucleotideIndex = new int[255]; for (int i = 0; i < 255; i++) @@ -162,6 +162,7 @@ public class ResidueProperties nucleotideName.put(namesArray[i][0].toUpperCase(Locale.ROOT), namesArray[i][1]); } + } static diff --git a/src/jalview/util/Comparison.java b/src/jalview/util/Comparison.java index 2dcbeb5..af6052c 100644 --- a/src/jalview/util/Comparison.java +++ b/src/jalview/util/Comparison.java @@ -23,6 +23,8 @@ package jalview.util; import java.util.ArrayList; import java.util.List; +import jalview.bin.Cache; +import jalview.bin.Console; import jalview.datamodel.SequenceI; /** @@ -32,9 +34,13 @@ public class Comparison { private static final int EIGHTY_FIVE = 85; - private static final int NINETY_NINE = 99; + private static final int NUCLEOTIDE_COUNT_PERCENT; - private static final int TO_UPPER_CASE = 'a' - 'A'; + private static final int NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT; + + private static final int NUCLEOTIDE_COUNT_SHORT_SEQUENCE; + + private static final int NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE; public static final char GAP_SPACE = ' '; @@ -46,6 +52,18 @@ public class Comparison new char[] { GAP_SPACE, GAP_DOT, GAP_DASH }); + static + { + NUCLEOTIDE_COUNT_PERCENT = Cache.getDefault("NUCLEOTIDE_COUNT_PERCENT", + 55); + NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT = Cache.getDefault( + "NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT", 95); + NUCLEOTIDE_COUNT_SHORT_SEQUENCE = Cache + .getDefault("NUCLEOTIDE_COUNT_SHORT", 100); + NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE = Cache + .getDefault("NUCLEOTIDE_COUNT_VERY_SHORT", 4); + } + /** * DOCUMENT ME! * @@ -93,7 +111,6 @@ public class Comparison jlen--; } - int count = 0; int match = 0; float pid = -1; @@ -106,8 +123,6 @@ public class Comparison { match++; } - - count++; } pid = (float) match / (float) ilen * 100; @@ -121,8 +136,6 @@ public class Comparison { match++; } - - count++; } pid = (float) match / (float) jlen * 100; @@ -270,14 +283,16 @@ public class Comparison */ public static final boolean isNucleotide(SequenceI seq) { - if (seq == null) + if (seq == null || seq.getLength() == 0) { return false; } - long ntCount = 0; - long aaCount = 0; - long nCount = 0; - long ntaCount = 0; + long ntCount = 0; // nucleotide symbol count (does not include ntaCount) + long aaCount = 0; // non-nucleotide, non-gap symbol count (includes nCount + // and ntaCount) + long nCount = 0; // "Unknown" (N) symbol count + long xCount = 0; // Also used as "Unknown" (X) symbol count + long ntaCount = 0; // nucleotide ambiguity symbol count int len = seq.getLength(); for (int i = 0; i < len; i++) @@ -296,6 +311,10 @@ public class Comparison } else { + if (isX(c)) + { + xCount++; + } if (isNucleotideAmbiguity(c)) { ntaCount++; @@ -303,25 +322,98 @@ public class Comparison } } } - /* - * Check for nucleotide count > 85% of total count (in a form that evades - * int / float conversion or divide by zero). - */ - if ((ntCount + nCount) * 100 > EIGHTY_FIVE * (ntCount + aaCount)) + long allCount = ntCount + aaCount; + + if (Cache.getDefault("NUCLEOTIDE_AMBIGUITY_DETECTION", true)) { - return ntCount > 0; // all N is considered protein. Could use a threshold - // here too + Console.info("Performing new nucleotide detection routine"); + if (allCount > NUCLEOTIDE_COUNT_SHORT_SEQUENCE) + { + // a long sequence. + // check for at least 55% nucleotide, and nucleotide and ambiguity codes + // (including N) must make up 95% + return ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount + && 100 * (ntCount + nCount + + ntaCount) > NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT + * allCount; + } + else if (allCount > NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE) + { + // a short sequence. + // check if a short sequence is at least 55% nucleotide and the rest of + // the symbols are all X or all N + if (ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount + && (nCount == aaCount || xCount == aaCount)) + { + return true; + } + + // a short sequence. + // check for at least x% nucleotide and all the rest nucleotide + // ambiguity codes (including N), where x slides from 75% for sequences + // of length 4 (i.e. only one non-nucleotide) to 55% for sequences of + // length 100 + return myShortSequenceNucleotideProportionCount(ntCount, allCount) + && nCount + ntaCount == aaCount; + } + else + { + // a very short sequence. (<4) + // all bases must be nucleotide + return ntCount > 0 && ntCount == allCount; + } } else { - // check for very large proportion of nucleotide and all ambiguity codes - if ((ntCount + nCount + ntaCount) * 100 >= NINETY_NINE - * (ntCount + aaCount)) + Console.info("Performing old nucleotide detection routine"); + /* + * Check for nucleotide count > 85% of total count (in a form that evades + * int / float conversion or divide by zero). + */ + if ((ntCount + nCount) * 100 > EIGHTY_FIVE * allCount) { - return ntCount > 0; + return ntCount > 0; // all N is considered protein. Could use a + // threshold here too } - return false; } + return false; + } + + protected static boolean myShortSequenceNucleotideProportionCount( + long ntCount, long allCount) + { + /** + * this method is valid only for NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE <= + * allCount <= NUCLEOTIDE_COUNT_SHORT_SEQUENCE + */ + // the following is a simplified integer version of: + // + // a := allCount # the number of bases in the sequence + // n : = ntCount # the number of definite nucleotide bases + // vs := NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE + // s := NUCLEOTIDE_COUNT_SHORT_SEQUENCE + // lp := NUCLEOTIDE_COUNT_LOWER_PERCENT + // vsp := 1 - (1/a) # this is the proportion of required nucleotides in + // # a VERY_SHORT Sequence (4 bases). + // # should be all but one base is nucleotide. + // p := (a - vs)/(s - vs) # proportion of the way between + // # VERY_SHORT and SHORT thresholds. + // tp := vsp + p * (lp/100 - vsp) # the proportion of nucleotides + // # required for this length of sequence. + // minNt := tp * a # the minimum number of definite nucleotide bases + // # required for this length of sequences. + // + // We are then essentially returning: + // # ntCount >= 55% of allCount and the rest are all nucleotide ambiguity + // ntCount >= tp * allCount && nCount + ntaCount == aaCount + // but without going into float/double land + long LHS = 100 * allCount + * (NUCLEOTIDE_COUNT_SHORT_SEQUENCE + - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE) + * (ntCount - allCount + 1); + long RHS = allCount * (allCount - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE) + * (allCount * NUCLEOTIDE_COUNT_PERCENT - 100 * allCount + 100); + return LHS >= RHS; } /** @@ -369,7 +461,10 @@ public class Comparison return isNucleotide(c, false); } - public static boolean isNucleotide(char c, boolean countAmbiguity) + /** + * includeAmbiguity = true also includes Nucleotide Ambiguity codes + */ + public static boolean isNucleotide(char c, boolean includeAmbiguity) { char C = Character.toUpperCase(c); switch (C) @@ -381,7 +476,7 @@ public class Comparison case 'U': return true; } - if (countAmbiguity) + if (includeAmbiguity) { boolean ambiguity = isNucleotideAmbiguity(C); if (ambiguity) @@ -390,6 +485,9 @@ public class Comparison return false; } + /** + * Tests *only* nucleotide ambiguity codes (and not definite nucleotide codes) + */ public static boolean isNucleotideAmbiguity(char c) { switch (Character.toUpperCase(c)) @@ -432,6 +530,12 @@ public class Comparison */ public static boolean isNucleotideSequence(String s, boolean allowGaps) { + return isNucleotideSequence(s, allowGaps, false); + } + + public static boolean isNucleotideSequence(String s, boolean allowGaps, + boolean includeAmbiguous) + { if (s == null) { return false; @@ -439,7 +543,7 @@ public class Comparison for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); - if (!isNucleotide(c)) + if (!isNucleotide(c, includeAmbiguous)) { if (!allowGaps || !isGap(c)) { -- 1.7.10.2