<mapID target="colours.turn" url="html/colourSchemes/turn.html" />
<mapID target="colours.buried" url="html/colourSchemes/buried.html" />
<mapID target="colours.nucleotide" url="html/colourSchemes/nucleotide.html" />
+ <mapID target="colours.nucleotideambiguity" url="html/colourSchemes/nucleotideambiguity.html" />
<mapID target="colours.blosum" url="html/colourSchemes/blosum.html" />
<mapID target="colours.pid" url="html/colourSchemes/pid.html" />
<mapID target="colours.user" url="html/colourSchemes/user.html"/>
<tocitem text="Turn propensity" target="colours.turn" />
<tocitem text="Buried index" target="colours.buried" />
<tocitem text="Nucleotide colours" target="colours.nucleotide" />
+ <tocitem text="Nucleotide Ambiguity colours" target="colours.nucleotideambiguity" />
<tocitem text="Purine/Pyrimidine colours" target="colours.purinepyrimidine" />
<tocitem text="Blosum62" target="colours.blosum" />
<tocitem text="by Percentage Identity" target="colours.pid" />
<td>I</td>
<!--Inosine-->
<td>X</td>
- <!--Xanthine-->
+ <!--Unknown (sometimes Xanthine)-->
<td>R</td>
<!--Unknown Purine-->
<td>Y</td>
<!--Unknown Pyrimidine-->
- <td>N</td>
- <!--Unknown-->
<td>W</td>
<!--Weak nucleotide (A or T)-->
<td>S</td>
<td>D</td>
<!--Not C (A or G or T)-->
<td>V</td>
- <!--Not T (A or G or C-->
+ <!--Not T (A or G or C)-->
+ <td>N</td>
+ <!--Unknown-->
</tr>
<tr>
<td height="24">Nucleotide</td>
<td></td>
</tr>
<tr>
+ <td height="24">Nucleotide Ambiguity</td>
+ <td bgcolor="#f0fff0"></td>
+ <td bgcolor="#f0fff0"></td>
+ <td bgcolor="#f0fff0"></td>
+ <td bgcolor="#f0fff0"></td>
+ <td bgcolor="#f0fff0"></td>
+ <td bgcolor="#ffffff"></td>
+ <td bgcolor="#4f6f6f"></td>
+ <td bgcolor="#CD5C5C"></td>
+ <td bgcolor="#008000"></td>
+ <td bgcolor="#4682B4"></td>
+ <td bgcolor="#FF8C00"></td>
+ <td bgcolor="#9ACD32"></td>
+ <td bgcolor="#9932CC"></td>
+ <td bgcolor="#8b4513"></td>
+ <td bgcolor="#808080"></td>
+ <td bgcolor="#483D8B"></td>
+ <td bgcolor="#b8860b"></td>
+ <td bgcolor="#2f4f4f"></td>
+ </tr>
+ <tr>
<td height="24">Purine/Pyrimidine</td>
<td bgcolor="#FF83FA"></td>
<td bgcolor="#40E0D0"></td>
--- /dev/null
+<html>
+<!--
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ -->
+<head>
+<title>Nucleotide Ambiguity Colour Scheme</title>
+<style type="text/css">
+<!--
+td {
+ text-align: center;
+}
+-->
+</style>
+</head>
+
+<body>
+ <p>
+ <strong>Nucleotide Ambiguity Colours</strong>
+ </p>
+
+
+ <p>
+ This colour scheme was devised by Suzanne Duce and the Jalview Team to highlight ambiguity codes used in nucleotide sequences.
+ </p>
+ <p>
+ The use of X to represent an unknown base is acknowledged, but this is not recommended as the symbol refers to xanthine (see IUPAC-IUB Commission on Biochemical Nomenclature (CBN), <a href="https://iupac.qmul.ac.uk/misc/naabb.html">Abbreviations and Symbols for Nucleic Acids, Polynucleotides and their Constituents</a>).
+ </p>
+ <div align="center">
+ <table width="200" border="1">
+ <tr>
+ <td bgcolor="#f0fff0">A</td>
+ <td>Adenine</td>
+ </tr>
+ <tr>
+ <td bgcolor="#f0fff0">C</td>
+ <td>Cytosine</td>
+ </tr>
+ <tr>
+ <td bgcolor="#f0fff0">G</td>
+ <td>Guanine</td>
+ </tr>
+ <tr>
+ <td bgcolor="#f0fff0">T</td>
+ <td>Thymine</td>
+ </tr>
+ <tr>
+ <td bgcolor="#f0fff0">U</td>
+ <td>Uracil</td>
+ </tr>
+ <tr>
+ <td bgcolor="#ffffff">I</td>
+ <td>Inosine</td>
+ </tr>
+ <tr>
+ <td bgcolor="#4f6f6f">X</td>
+ <td>Unknown (sometimes Xanthine)</td>
+ </tr>
+ <tr>
+ <td bgcolor="#CD5C5C">R</td>
+ <td>Unknown Purine</td>
+ </tr>
+ <tr>
+ <td bgcolor="#008000">Y</td>
+ <td>Unknown Pyrimidine</td>
+ </tr>
+ <tr>
+ <td bgcolor="#4682B4">W</td>
+ <td>Weak nucleotide (A or T)</td>
+ </tr>
+ <tr>
+ <td bgcolor="#FF8C00">S</td>
+ <td>Strong nucleotide (G or C)</td>
+ </tr>
+ <tr>
+ <td bgcolor="#9ACD32">M</td>
+ <td>Amino (A or C)</td>
+ </tr>
+ <tr>
+ <td bgcolor="#9932CC">K</td>
+ <td>Keto (G or T)</td>
+ </tr>
+ <tr>
+ <td bgcolor="#8b4513">B</td>
+ <td>Not A (G or C or T)</td>
+ </tr>
+ <tr>
+ <td bgcolor="#808080">H</td>
+ <td>Not G (A or C or T)</td>
+ </tr>
+ <tr>
+ <td bgcolor="#483D8B">D</td>
+ <td>Not C (A or G or T)</td>
+ </tr>
+ <tr>
+ <td bgcolor="#b8860b">V</td>
+ <td>Not T (A or G or C)</td>
+ </tr>
+ <tr>
+ <td bgcolor="#2f4f4f">N</td>
+ <td>Unknown</td>
+ </tr>
+ </table>
+ </div>
+</body>
+</html>
Blosum62 Score, Percentage Identity, Zappo, Taylor,
gecos:flower, gecos:blossom, gecos:sunset, gecos:ocean,
Hydrophobicity, Helix Propensity, Strand Propensity, Turn
- Propensity, Buried Index, Nucleotide, Purine/Pyrimidine, User
+ Propensity, Buried Index, Nucleotide, Nucleotide Ambiguity, Purine/Pyrimidine, User
Defined<br>
</strong> <em>See <a href="../colourSchemes/index.html">colours</a>
for a description of all colour schemes.
Blosum62 Score, Percentage Identity, Zappo, Taylor,
gecos:flower, gecos:blossom, gecos:sunset, gecos:ocean,
Hydrophobicity, Helix Propensity, Strand Propensity, Turn
- Propensity, Buried Index, Nucleotide, Purine/Pyrimidine, User
+ Propensity, Buried Index, Nucleotide, Nucleotide Ambiguity, Purine/Pyrimidine, User
Defined<br>
</strong> <em>See <a href="../colourSchemes/index.html">colours</a> for
a description of all colour schemes.
label.colourScheme_buriedindex = Buried Index
label.colourScheme_purine/pyrimidine = Purine/Pyrimidine
label.colourScheme_nucleotide = Nucleotide
+label.colourScheme_nucleotideambiguity = Nucleotide Ambiguity
label.colourScheme_t-coffeescores = T-Coffee Scores
label.colourScheme_rnahelices = By RNA Helices
label.colourScheme_sequenceid = Sequence ID Colour
label.colourScheme_buriedindex = Índice de encubrimiento
label.colourScheme_purine/pyrimidine = Purina/Pirimidina
label.colourScheme_nucleotide = Nucleótido
+label.colourScheme_nucleotideambiguity = Ambigüedad de nucleótido
label.colourScheme_t-coffeescores = Puntuación del T-Coffee
label.colourScheme_rnahelices = Por hélices de RNA
label.colourScheme_sequenceid = Color de ID de secuencia
Turn("Turn Propensity", TurnColourScheme.class),
Buried("Buried Index", BuriedColourScheme.class),
Nucleotide("Nucleotide", NucleotideColourScheme.class),
+ NucleotideAmbiguity("Nucleotide Ambiguity",
+ NucleotideAmbiguityColourScheme.class),
PurinePyrimidine("Purine/Pyrimidine", PurinePyrimidineColourScheme.class),
RNAHelices("RNA Helices", RNAHelicesColour.class),
TCoffee("T-Coffee Scores", TCoffeeColourScheme.class),
- IdColour("Sequence ID", IdColourScheme.class);
+ IdColour("Sequence ID", IdColourScheme.class),;
// RNAInteraction("RNA Interaction type", RNAInteractionColourScheme.class)
private String name;
--- /dev/null
+package jalview.schemes;
+
+import jalview.api.AlignViewportI;
+import jalview.datamodel.AnnotatedCollectionI;
+
+public class NucleotideAmbiguityColourScheme extends ResidueColourScheme
+{
+ /**
+ * Creates a new NucleotideAmbiguityColourScheme object, passing the
+ * ResidueProperties.nucleotideIndex symbol lookup and the
+ * ResidueProperties.nucleotideAmbiguity colour table to the
+ * ResidueColourScheme constructor.
+ */
+ public NucleotideAmbiguityColourScheme()
+ {
+ super(ResidueProperties.nucleotideIndex,
+ ResidueProperties.nucleotideAmbiguity);
+ }
+
+ @Override
+ public boolean isNucleotideSpecific()
+ {
+ return true;
+ }
+
+ @Override
+ public String getSchemeName()
+ {
+ return JalviewColourScheme.NucleotideAmbiguity.toString();
+ }
+
+ /**
+ * Returns a new instance of this colour scheme with which the given data may
+ * be coloured. Neither argument is consulted: a fresh default-constructed
+ * scheme is returned every time.
+ */
+ @Override
+ public ColourSchemeI getInstance(AlignViewportI view,
+ AnnotatedCollectionI coll)
+ {
+ return new NucleotideAmbiguityColourScheme();
+ }
+
+}
*/
package jalview.schemes;
-import java.util.Locale;
-
-import jalview.analysis.GeneticCodes;
-
import java.awt.Color;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Vector;
+import jalview.analysis.GeneticCodes;
+
public class ResidueProperties
{
// Stores residue codes/names and colours and other things
public static final Map<String, String> nucleotideName = new HashMap<>();
+ public static final Map<String, String> nucleotideAmbiguityName = new HashMap<>();
+
// lookup from modified amino acid (e.g. MSE) to canonical form (e.g. MET)
public static final Map<String, String> modifications = new HashMap<>();
/**
* maximum (gap) index for matrices involving nucleotide alphabet
*/
- public final static int maxNucleotideIndex = 10;
+ // previously hard-coded to 10; now assigned in the static initialiser below
+ // as one past the last nucleotide symbol index
+ public final static int maxNucleotideIndex;
static
{
+ String[][] namesArray = { { "a", "Adenine" }, { "c", "Cytosine" },
+ { "g", "Guanine" },
+ { "t", "Thymine" },
+ { "u", "Uracil" },
+ { "i", "Inosine" },
+ { "x", "Xanthine" },
+ { "r", "Unknown Purine" },
+ { "y", "Unknown Pyrimidine" },
+ { "w", "Weak nucleotide (A or T)" },
+ { "s", "Strong nucleotide (G or C)" },
+ { "m", "Amino (A or C)" },
+ { "k", "Keto (G or T)" },
+ { "b", "Not A (G or C or T)" },
+ { "h", "Not G (A or C or T)" },
+ { "d", "Not C (A or G or T)" },
+ { "v", "Not T (A or G or C)" },
+ { "n", "Unknown" } };
+ // "gap" index
+ maxNucleotideIndex = namesArray.length;
+
nucleotideIndex = new int[255];
for (int i = 0; i < 255; i++)
{
- nucleotideIndex[i] = 10; // non-nucleotide symbols are all non-gap gaps.
+ nucleotideIndex[i] = maxNucleotideIndex; // non-nucleotide symbols are all
+ // non-gap gaps.
}
- nucleotideIndex['A'] = 0;
- nucleotideIndex['a'] = 0;
- nucleotideIndex['C'] = 1;
- nucleotideIndex['c'] = 1;
- nucleotideIndex['G'] = 2;
- nucleotideIndex['g'] = 2;
- nucleotideIndex['T'] = 3;
- nucleotideIndex['t'] = 3;
- nucleotideIndex['U'] = 4;
- nucleotideIndex['u'] = 4;
- nucleotideIndex['I'] = 5;
- nucleotideIndex['i'] = 5;
- nucleotideIndex['X'] = 6;
- nucleotideIndex['x'] = 6;
- nucleotideIndex['R'] = 7;
- nucleotideIndex['r'] = 7;
- nucleotideIndex['Y'] = 8;
- nucleotideIndex['y'] = 8;
- nucleotideIndex['N'] = 9;
- nucleotideIndex['n'] = 9;
-
- nucleotideName.put("A", "Adenine");
- nucleotideName.put("a", "Adenine");
- nucleotideName.put("G", "Guanine");
- nucleotideName.put("g", "Guanine");
- nucleotideName.put("C", "Cytosine");
- nucleotideName.put("c", "Cytosine");
- nucleotideName.put("T", "Thymine");
- nucleotideName.put("t", "Thymine");
- nucleotideName.put("U", "Uracil");
- nucleotideName.put("u", "Uracil");
- nucleotideName.put("I", "Inosine");
- nucleotideName.put("i", "Inosine");
- nucleotideName.put("X", "Xanthine");
- nucleotideName.put("x", "Xanthine");
- nucleotideName.put("R", "Unknown Purine");
- nucleotideName.put("r", "Unknown Purine");
- nucleotideName.put("Y", "Unknown Pyrimidine");
- nucleotideName.put("y", "Unknown Pyrimidine");
- nucleotideName.put("N", "Unknown");
- nucleotideName.put("n", "Unknown");
- nucleotideName.put("W", "Weak nucleotide (A or T)");
- nucleotideName.put("w", "Weak nucleotide (A or T)");
- nucleotideName.put("S", "Strong nucleotide (G or C)");
- nucleotideName.put("s", "Strong nucleotide (G or C)");
- nucleotideName.put("M", "Amino (A or C)");
- nucleotideName.put("m", "Amino (A or C)");
- nucleotideName.put("K", "Keto (G or T)");
- nucleotideName.put("k", "Keto (G or T)");
- nucleotideName.put("B", "Not A (G or C or T)");
- nucleotideName.put("b", "Not A (G or C or T)");
- nucleotideName.put("H", "Not G (A or C or T)");
- nucleotideName.put("h", "Not G (A or C or T)");
- nucleotideName.put("D", "Not C (A or G or T)");
- nucleotideName.put("d", "Not C (A or G or T)");
- nucleotideName.put("V", "Not T (A or G or C");
- nucleotideName.put("v", "Not T (A or G or C");
+ for (int i = 0; i < namesArray.length; i++)
+ {
+ char c = namesArray[i][0].charAt(0);
+ nucleotideIndex[c] = i;
+ // Character.toUpperCase is Locale insensitive
+ nucleotideIndex[Character.toUpperCase(c)] = i;
+ nucleotideName.put(namesArray[i][0], namesArray[i][1]);
+ nucleotideName.put(namesArray[i][0].toUpperCase(Locale.ROOT),
+ namesArray[i][1]);
+ }
}
Color.white, // R
Color.white, // Y
Color.white, // N
+ Color.white, // w
+ Color.white, // s
+ Color.white, // m
+ Color.white, // k
+ Color.white, // b
+ Color.white, // h
+ Color.white, // d
+ Color.white, // v
+ Color.white, // Gap
+ };
+
+ // this colour scheme devised by sduce
+ public static final Color[] nucleotideAmbiguity = {
+ Color.decode("#f0fff0"), // a
+ Color.decode("#f0fff0"), // c
+ Color.decode("#f0fff0"), // g
+ Color.decode("#f0fff0"), // t
+ Color.decode("#f0fff0"), // u
+ Color.decode("#ffffff"), // i
+ Color.decode("#4f6f6f"), // x
+ Color.decode("#CD5C5C"), // r
+ Color.decode("#008000"), // y
+ Color.decode("#4682B4"), // w
+ Color.decode("#FF8C00"), // s
+ Color.decode("#9ACD32"), // m
+ Color.decode("#9932CC"), // k
+ Color.decode("#8b4513"), // b
+ Color.decode("#808080"), // h
+ Color.decode("#483D8B"), // d
+ Color.decode("#b8860b"), // v
+ Color.decode("#2f4f4f"), // n
Color.white, // Gap
};
*/
package jalview.util;
-import jalview.datamodel.SequenceI;
-
import java.util.ArrayList;
import java.util.List;
+import jalview.bin.Cache;
+import jalview.bin.Console;
+import jalview.datamodel.SequenceI;
+
/**
* Assorted methods for analysing or comparing sequences.
*/
{
private static final int EIGHTY_FIVE = 85;
- private static final int TO_UPPER_CASE = 'a' - 'A';
+ private static final int NUCLEOTIDE_COUNT_PERCENT;
+
+ private static final int NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT;
+
+ private static final int NUCLEOTIDE_COUNT_SHORT_SEQUENCE;
+
+ private static final int NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE;
+
+ private static final boolean NUCLEOTIDE_AMBIGUITY_DETECTION;
public static final char GAP_SPACE = ' ';
new char[]
{ GAP_SPACE, GAP_DOT, GAP_DASH });
+ static
+ {
+ // these options read only at start of session
+ NUCLEOTIDE_COUNT_PERCENT = Cache.getDefault("NUCLEOTIDE_COUNT_PERCENT",
+ 55);
+ NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT = Cache.getDefault(
+ "NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT", 95);
+ NUCLEOTIDE_COUNT_SHORT_SEQUENCE = Cache
+ .getDefault("NUCLEOTIDE_COUNT_SHORT", 100);
+ NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE = Cache
+ .getDefault("NUCLEOTIDE_COUNT_VERY_SHORT", 4);
+ NUCLEOTIDE_AMBIGUITY_DETECTION = Cache
+ .getDefault("NUCLEOTIDE_AMBIGUITY_DETECTION", true);
+ }
+
/**
* DOCUMENT ME!
*
jlen--;
}
- int count = 0;
int match = 0;
float pid = -1;
{
match++;
}
-
- count++;
}
pid = (float) match / (float) ilen * 100;
{
match++;
}
-
- count++;
}
pid = (float) match / (float) jlen * 100;
*/
public static final boolean isGap(char c)
{
- return (c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE) ? true : false;
+ return c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE;
}
/**
*/
public static final boolean isNucleotide(SequenceI seq)
{
- if (seq == null)
+ if (seq == null || seq.getLength() == 0)
{
return false;
}
- long ntCount = 0;
- long aaCount = 0;
- long nCount = 0;
+ long ntCount = 0; // nucleotide symbol count (does not include ntaCount)
+ long aaCount = 0; // non-nucleotide, non-gap symbol count (includes nCount
+ // and ntaCount)
+ long nCount = 0; // "Unknown" (N) symbol count
+ long xCount = 0; // Also used as "Unknown" (X) symbol count
+ long ntaCount = 0; // nucleotide ambiguity symbol count
int len = seq.getLength();
for (int i = 0; i < len; i++)
{
char c = seq.getCharAt(i);
- if (isNucleotide(c) || isX(c))
+ if (isNucleotide(c))
{
ntCount++;
}
{
nCount++;
}
+ else
+ {
+ if (isX(c))
+ {
+ xCount++;
+ }
+ if (isNucleotideAmbiguity(c))
+ {
+ ntaCount++;
+ }
+ }
}
}
- /*
- * Check for nucleotide count > 85% of total count (in a form that evades
- * int / float conversion or divide by zero).
- */
- if ((ntCount + nCount) * 100 > EIGHTY_FIVE * (ntCount + aaCount))
+ long allCount = ntCount + aaCount;
+
+ if (NUCLEOTIDE_AMBIGUITY_DETECTION)
{
- return ntCount > 0; // all N is considered protein. Could use a threshold
- // here too
+ Console.debug("Performing new nucleotide detection routine");
+ if (allCount > NUCLEOTIDE_COUNT_SHORT_SEQUENCE)
+ {
+ // a long sequence.
+ // check for at least 55% nucleotide, and nucleotide and ambiguity codes
+ // (including N) must make up 95%
+ return ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
+ && 100 * (ntCount + nCount
+ + ntaCount) > NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT
+ * allCount;
+ }
+ else if (allCount > NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
+ {
+ // a short sequence.
+ // check if a short sequence is at least 55% nucleotide and the rest of
+ // the symbols are all X or all N
+ if (ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
+ && (nCount == aaCount || xCount == aaCount))
+ {
+ return true;
+ }
+
+ // a short sequence.
+ // check for at least x% nucleotide and all the rest nucleotide
+ // ambiguity codes (including N), where x slides from 75% for sequences
+ // of length 4 (i.e. only one non-nucleotide) to 55% for sequences of
+ // length 100
+ return myShortSequenceNucleotideProportionCount(ntCount, allCount)
+ && nCount + ntaCount == aaCount;
+ }
+ else
+ {
+ // a very short sequence. (<= NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE symbols, 4 by default)
+ // all bases must be nucleotide
+ return ntCount > 0 && ntCount == allCount;
+ }
}
else
{
- return false;
+ Console.debug("Performing old nucleotide detection routine");
+ /*
+ * Check for nucleotide count > 85% of total count (in a form that evades
+ * int / float conversion or divide by zero).
+ */
+ if ((ntCount + nCount) * 100 > EIGHTY_FIVE * allCount)
+ {
+ return ntCount > 0; // all N is considered protein. Could use a
+ // threshold here too
+ }
}
+ return false;
+ }
+
+ protected static boolean myShortSequenceNucleotideProportionCount(
+ long ntCount, long allCount)
+ {
+ /**
+ * this method is valid only for NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE <=
+ * allCount <= NUCLEOTIDE_COUNT_SHORT_SEQUENCE
+ *
+ * Returns true when ntCount reaches the sliding minimum derived below.
+ * The integer comparison is obtained by multiplying the inequality
+ * through by (NUCLEOTIDE_COUNT_SHORT_SEQUENCE -
+ * NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE), so it assumes that difference is
+ * positive; a configuration with SHORT <= VERY_SHORT would not preserve
+ * the direction of the comparison.
+ */
+ // the following is a simplified integer version of:
+ //
+ // a := allCount # the number of bases in the sequence
+ // n := ntCount # the number of definite nucleotide bases
+ // vs := NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE
+ // s := NUCLEOTIDE_COUNT_SHORT_SEQUENCE
+ // lp := NUCLEOTIDE_COUNT_PERCENT
+ // vsp := 1 - (1/a) # this is the proportion of required definite
+ // nucleotides
+ // # in a VERY_SHORT Sequence (4 bases).
+ // # This should be equivalent to all but one base in the sequence.
+ // p := (a - vs)/(s - vs) # proportion of the way between
+ // # VERY_SHORT and SHORT thresholds.
+ // tp := vsp + p * (lp/100 - vsp) # the proportion of definite nucleotides
+ // # required for this length of sequence.
+ // minNt := tp * a # the minimum number of definite nucleotide bases
+ // # required for this length of sequence.
+ //
+ // This method returns only the proportion test:
+ // # ntCount >= tp * allCount
+ // (the caller separately checks nCount + ntaCount == aaCount),
+ // rearranged and scaled by 100 * a * (s - vs) to stay out of float/double land
+ long LHS = 100 * allCount
+ * (NUCLEOTIDE_COUNT_SHORT_SEQUENCE
+ - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
+ * (ntCount - allCount + 1);
+ long RHS = allCount * (allCount - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
+ * (allCount * NUCLEOTIDE_COUNT_PERCENT - 100 * allCount + 100);
+ return LHS >= RHS;
}
/**
*/
public static boolean isNucleotide(char c)
{
- if ('a' <= c && c <= 'z')
- {
- c -= TO_UPPER_CASE;
- }
- switch (c)
+ return isNucleotide(c, false);
+ }
+
+ /**
+ * includeAmbiguity = true also includes Nucleotide Ambiguity codes
+ */
+ public static boolean isNucleotide(char c, boolean includeAmbiguity)
+ {
+ char C = Character.toUpperCase(c);
+ switch (C)
{
case 'A':
case 'C':
case 'U':
return true;
}
+ if (includeAmbiguity)
+ {
+ boolean ambiguity = isNucleotideAmbiguity(C);
+ if (ambiguity)
+ return true;
+ }
return false;
}
- public static boolean isN(char c)
+ /**
+ * Tests *only* nucleotide ambiguity codes (and not definite nucleotide codes)
+ */
+ public static boolean isNucleotideAmbiguity(char c)
{
- switch (c)
+ switch (Character.toUpperCase(c))
{
- case 'N':
- case 'n':
+ case 'I':
+ case 'X':
+ case 'R':
+ case 'Y':
+ case 'W':
+ case 'S':
+ case 'M':
+ case 'K':
+ case 'B':
+ case 'H':
+ case 'D':
+ case 'V':
return true;
+ case 'N': // not counting N as nucleotide
}
return false;
}
+ public static boolean isN(char c)
+ {
+ return 'n' == Character.toLowerCase(c);
+ }
+
public static boolean isX(char c)
{
- switch (c)
- {
- case 'X':
- case 'x':
- return true;
- }
- return false;
+ return 'x' == Character.toLowerCase(c);
}
/**
*/
public static boolean isNucleotideSequence(String s, boolean allowGaps)
{
+ return isNucleotideSequence(s, allowGaps, false);
+ }
+
+ public static boolean isNucleotideSequence(String s, boolean allowGaps,
+ boolean includeAmbiguous)
+ {
if (s == null)
{
return false;
for (int i = 0; i < s.length(); i++)
{
char c = s.charAt(i);
- if (!isNucleotide(c))
+ if (!isNucleotide(c, includeAmbiguous))
{
if (!allowGaps || !isGap(c))
{
public static boolean isSameResidue(char c1, char c2,
boolean caseSensitive)
{
- if (caseSensitive)
- {
- return (c1 == c2);
- }
- else
- {
- return Character.toUpperCase(c1) == Character.toUpperCase(c2);
- }
+ return caseSensitive ? c1 == c2
+ : Character.toUpperCase(c1) == Character.toUpperCase(c2);
}
}
import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.testng.annotations.AfterClass;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+import jalview.bin.Cache;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
import jalview.datamodel.Alignment;
import jalview.util.MapList;
import jalview.ws.SequenceFetcher;
import jalview.ws.SequenceFetcherFactory;
-import jalview.ws.params.InvalidArgumentException;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.testng.annotations.AfterClass;
-import org.testng.annotations.BeforeClass;
-import org.testng.annotations.Test;
public class CrossRefTest
{
JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION);
}
+ @BeforeMethod(alwaysRun = true)
+ public void loadProperties()
+ {
+ Cache.loadProperties("test/jalview/util/comparisonTestProps.jvprops");
+ }
+
@Test(groups = { "Functional" })
public void testFindXDbRefs()
{
AlignmentI al = new Alignment(new SequenceI[] { emblSeq, uniprotSeq });
Alignment xrefs = new CrossRef(new SequenceI[] { emblSeq }, al)
.findXrefSequences("UNIPROT", true);
+ System.err.println("xrefs=" + xrefs);
assertEquals(1, xrefs.getHeight());
assertSame(uniprotSeq, xrefs.getSequenceAt(0));
}
*/
package jalview.datamodel;
-import java.util.Locale;
-
import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
import static org.testng.AssertJUnit.assertNotNull;
import static org.testng.AssertJUnit.assertSame;
import static org.testng.AssertJUnit.assertTrue;
-import jalview.analysis.AlignmentGenerator;
-import jalview.commands.EditCommand;
-import jalview.commands.EditCommand.Action;
-import jalview.datamodel.PDBEntry.Type;
-import jalview.gui.JvOptionPane;
-import jalview.util.MapList;
-import jalview.ws.params.InvalidArgumentException;
-
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Vector;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
+import jalview.analysis.AlignmentGenerator;
+import jalview.bin.Cache;
+import jalview.commands.EditCommand;
+import jalview.commands.EditCommand.Action;
+import jalview.datamodel.PDBEntry.Type;
+import jalview.gui.JvOptionPane;
+import jalview.util.MapList;
import junit.extensions.PA;
public class SequenceTest
JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION);
}
+ @BeforeMethod(alwaysRun = true)
+ public void loadProperties()
+ {
+ Cache.loadProperties("test/jalview/util/comparisonTestProps.jvprops");
+ }
+
Sequence seq;
@BeforeMethod(alwaysRun = true)
assertTrue(new Sequence("prot", "ASDFASDFASDFXXXXXXXXX").isProtein());
// test DNA with X
assertFalse(new Sequence("prot", "ACGTACGTACGTXXXXXXXX").isProtein());
+ // short sequence is nucleotide only if more than 55% is nucleotide and
+ // remaining N/X is either N or X only
+ assertTrue(new Sequence("prot", "ACGTACGTACGTXN").isProtein());
// test DNA with N
assertFalse(new Sequence("prot", "ACGTACGTACGTNNNNNNNN").isProtein());
// test RNA with X
+ assertFalse(new Sequence("prot", "ACGUACGUACGUACTGACAXX").isProtein());
assertFalse(new Sequence("prot", "ACGUACGUACGUXXXXXXXXX").isProtein());
assertFalse(new Sequence("prot", "ACGUACGUACGUNNNNNNNNN").isProtein());
}
import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertNull;
-import jalview.gui.JvOptionPane;
-
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
+import jalview.gui.JvOptionPane;
+
public class ResiduePropertiesTest
{
*/
residues = ResidueProperties.getResidues(true, true);
Collections.sort(residues);
- assertEquals("[A, C, G, I, N, R, T, U, X, Y]", residues.toString());
+ assertEquals("[A, B, C, D, G, H, I, K, M, N, R, S, T, U, V, W, X, Y]",
+ residues.toString());
}
@Test(groups = { "Functional" })
import static org.testng.AssertJUnit.assertFalse;
import static org.testng.AssertJUnit.assertTrue;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+import jalview.bin.Cache;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceI;
import jalview.gui.JvOptionPane;
-import org.testng.annotations.BeforeClass;
-import org.testng.annotations.Test;
-
public class ComparisonTest
{
JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION);
}
+ @BeforeMethod(alwaysRun = true)
+ public void loadProperties()
+ {
+ Cache.loadProperties("test/jalview/util/comparisonTestProps.jvprops");
+ }
+
@Test(groups = { "Functional" })
public void testIsGap()
{
@Test(groups = { "Functional" })
public void testIsNucleotide_sequences()
{
- SequenceI seq = new Sequence("eightypercent", "agctuAGCPV");
+ SequenceI seq = new Sequence("eightypercent+fivepercent", "agctuagcPV");
assertFalse(Comparison.isNucleotide(new SequenceI[] { seq }));
assertFalse(
Comparison.isNucleotide(new SequenceI[][]
{ new SequenceI[] { seq } }));
- seq = new Sequence("eightyfivepercent", "agctuAGCPVagctuAGCUV");
+ seq = new Sequence("eightyfivepercent+tenpercent",
+ "agctuagcgVagctuagcuVE");
+ assertFalse(Comparison.isNucleotide(new SequenceI[] { seq }));
+
+ seq = new Sequence(">nineyfivepercent+0percent",
+ "aagctuagcgEagctuagcua");
+ assertFalse(Comparison.isNucleotide(new SequenceI[] { seq }));
+
+ seq = new Sequence("nineyfivepercent+0percent", "agctuagcgEagctuagcua");
assertFalse(Comparison.isNucleotide(new SequenceI[] { seq }));
- seq = new Sequence("nineypercent", "agctuAGCgVagctuAGCUV");
+ seq = new Sequence("nineyfivepercent+fivepercent",
+ "agctuagcgWagctuagcua");
assertTrue(Comparison.isNucleotide(new SequenceI[] { seq }));
+ seq = new Sequence("nineyfivepercent+tenpercent",
+ "agctuagcgEWWctuagcua");
+ assertFalse(Comparison.isNucleotide(new SequenceI[] { seq }));
+
+ seq = new Sequence("eightyfivepercent+fifteenpercent",
+ "agctuagcgWWWctuagcua");
+ assertFalse(Comparison.isNucleotide(new SequenceI[] { seq }));
+
seq = new Sequence("eightyfivepercentgapped",
"--agc--tuA--GCPV-a---gct-uA-GC---UV");
assertFalse(Comparison.isNucleotide(new SequenceI[] { seq }));
- seq = new Sequence("nineypercentgapped",
- "ag--ct-u-A---GC---g----Vag--c---tuAGCUV");
+ seq = new Sequence("ninetyfivepercentgapped",
+ "ag--ct-u-a---gc---g----aag--c---tuagcuV");
assertTrue(Comparison.isNucleotide(new SequenceI[] { seq }));
seq = new Sequence("allgap", "---------");
new SequenceI[]
{ seq, seq, seq, seq, seq2, seq2, null } }));
- seq = new Sequence("ProteinThatLooksLikeDNA", "WYATGCCTGAgtcgt");
- // 12/14 = 85.7%
+ String seqString = "aaatatatatgEcctgagtcgt";
+ seq = new Sequence("ShortProteinThatLooksLikeDNA", seqString);
+ assertFalse(Comparison.isNucleotide(new SequenceI[] { seq }));
+ seq = new Sequence("LongProteinThatLooksLikeDNA", seqString.repeat(10));
assertTrue(Comparison.isNucleotide(new SequenceI[] { seq }));
assertFalse(Comparison.isNucleotide((SequenceI[]) null));
assertFalse(Comparison.isNucleotide('P'));
}
+ @Test(groups = { "Functional" })
+ public void testIsNucleotideAmbiguity()
+ {
+ assertTrue(Comparison.isNucleotide('b', true));
+ assertTrue(Comparison.isNucleotide('B', true));
+ assertTrue(Comparison.isNucleotide('d', true));
+ assertTrue(Comparison.isNucleotide('V', true));
+ assertTrue(Comparison.isNucleotide('M', true));
+ assertTrue(Comparison.isNucleotide('s', true));
+ assertTrue(Comparison.isNucleotide('W', true));
+ assertTrue(Comparison.isNucleotide('x', true));
+ assertTrue(Comparison.isNucleotide('Y', true));
+ assertTrue(Comparison.isNucleotide('r', true));
+ assertTrue(Comparison.isNucleotide('i', true));
+ assertFalse(Comparison.isNucleotide('-', true));
+ assertFalse(Comparison.isNucleotide('n', true));
+ assertFalse(Comparison.isNucleotide('P', true));
+ }
+
/**
* Test the percentage identity calculation for two sequences
*/
assertFalse(Comparison.isNucleotideSequence("aAgGcCtTuUx", false));
assertTrue(Comparison.isNucleotideSequence("a A-g.GcCtTuU", true));
assertFalse(Comparison.isNucleotideSequence("a A-g.GcCtTuU", false));
+ assertFalse(Comparison.isNucleotideSequence("gatactawgataca", false));
+ // including nucleotide ambiguity
+ assertTrue(
+ Comparison.isNucleotideSequence("gatacaWgataca", true, true));
+ assertFalse(
+ Comparison.isNucleotideSequence("gatacaEgataca", true, true));
+
+ // not quite all nucleotides and ambiguity codes
+ Sequence seq = new Sequence("Ambiguity DNA codes", "gatacagatacabve");
+ assertFalse(Comparison.isNucleotide(seq));
+ // all nucleotide and nucleotide ambiguity codes
+ seq = new Sequence("Ambiguity DNA codes", "gatacagatacabvt");
+ assertFalse(Comparison.isNucleotide(seq));
+ seq = new Sequence("Ambiguity DNA codes", "agatacabb");
+ assertFalse(Comparison.isNucleotide(seq));
+ // 55% nucleotide with only Xs or Ns
+ assertTrue(Comparison
+ .isNucleotide(new Sequence("dnaWithXs", "gatacaXXXX")));
+ assertTrue(Comparison
+ .isNucleotide(new Sequence("dnaWithXs", "gatacaNNNN")));
+ assertFalse(Comparison
+ .isNucleotide(new Sequence("dnaWithXs", "gatacXXXXX")));
+ assertFalse(Comparison
+ .isNucleotide(new Sequence("dnaWithXs", "gatacNNNNN")));
}
@Test(groups = { "Functional" })
assertFalse(Comparison.isSameResidue('a', 'A', true));
assertFalse(Comparison.isSameResidue('A', 'a', true));
}
+
+ @Test(groups = { "Functional" })
+ public void testNucleotideProportion()
+ {
+ assertFalse(Comparison.myShortSequenceNucleotideProportionCount(2, 3));
+ assertTrue(Comparison.myShortSequenceNucleotideProportionCount(3, 3));
+ assertFalse(Comparison.myShortSequenceNucleotideProportionCount(2, 4));
+ assertTrue(Comparison.myShortSequenceNucleotideProportionCount(3, 4));
+ assertFalse(
+ Comparison.myShortSequenceNucleotideProportionCount(17, 20));
+ assertTrue(Comparison.myShortSequenceNucleotideProportionCount(18, 20));
+ assertFalse(
+ Comparison.myShortSequenceNucleotideProportionCount(38, 50));
+ assertTrue(Comparison.myShortSequenceNucleotideProportionCount(39, 50));
+ assertFalse(
+ Comparison.myShortSequenceNucleotideProportionCount(54, 100));
+ assertTrue(
+ Comparison.myShortSequenceNucleotideProportionCount(55, 100));
+ }
}
--- /dev/null
+NUCLEOTIDE_AMBIGUITY_DETECTION=true
+NUCLEOTIDE_COUNT_PERCENT=55
+NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT=95
+NUCLEOTIDE_COUNT_SHORT=100
+NUCLEOTIDE_COUNT_VERY_SHORT=4