2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import java.util.ArrayList;
24 import java.util.List;
26 import jalview.bin.Cache;
27 import jalview.bin.Console;
28 import jalview.datamodel.SequenceI;
31 * Assorted methods for analysing or comparing sequences.
33 public class Comparison
35 private static final int EIGHTY_FIVE = 85;
37 private static final int NUCLEOTIDE_COUNT_PERCENT;
39 private static final int NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT;
41 private static final int NUCLEOTIDE_COUNT_SHORT_SEQUENCE;
43 private static final int NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE;
45 public static final char GAP_SPACE = ' ';
47 public static final char GAP_DOT = '.';
49 public static final char GAP_DASH = '-';
51 public static final String GapChars = new String(
53 { GAP_SPACE, GAP_DOT, GAP_DASH });
57 NUCLEOTIDE_COUNT_PERCENT = Cache.getDefault("NUCLEOTIDE_COUNT_PERCENT",
59 NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT = Cache.getDefault(
60 "NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT", 95);
61 NUCLEOTIDE_COUNT_SHORT_SEQUENCE = Cache
62 .getDefault("NUCLEOTIDE_COUNT_SHORT", 100);
63 NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE = Cache
64 .getDefault("NUCLEOTIDE_COUNT_VERY_SHORT", 4);
75 * @return DOCUMENT ME!
77 public static final float compare(SequenceI ii, SequenceI jj)
79 return Comparison.compare(ii, jj, 0, ii.getLength() - 1);
83 * this was supposed to be an ungapped pid calculation
95 public static float compare(SequenceI ii, SequenceI jj, int start,
98 String si = ii.getSequenceAsString();
99 String sj = jj.getSequenceAsString();
101 int ilen = si.length() - 1;
102 int jlen = sj.length() - 1;
104 while (Comparison.isGap(si.charAt(start + ilen)))
109 while (Comparison.isGap(sj.charAt(start + jlen)))
119 for (int j = 0; j < jlen; j++)
121 if (si.substring(start + j, start + j + 1)
122 .equals(sj.substring(start + j, start + j + 1)))
128 pid = (float) match / (float) ilen * 100;
132 for (int j = 0; j < jlen; j++)
134 if (si.substring(start + j, start + j + 1)
135 .equals(sj.substring(start + j, start + j + 1)))
141 pid = (float) match / (float) jlen * 100;
148 * this is a gapped PID calculation
155 * @deprecated use PIDModel.computePID()
158 public final static float PID(String seq1, String seq2)
160 return PID(seq1, seq2, 0, seq1.length());
163 static final int caseShift = 'a' - 'A';
165 // Another pid with region specification
167 * @deprecated use PIDModel.computePID()
170 public final static float PID(String seq1, String seq2, int start,
173 return PID(seq1, seq2, start, end, true, false);
177 * Calculate percent identity for a pair of sequences over a particular range,
178 * with different options for ignoring gaps.
187 * - if true - gaps match any character, if false, do not match
189 * @param ungappedOnly
190 * - if true - only count PID over ungapped columns
192 * @deprecated use PIDModel.computePID()
195 public final static float PID(String seq1, String seq2, int start,
196 int end, boolean wcGaps, boolean ungappedOnly)
198 int s1len = seq1.length();
199 int s2len = seq2.length();
201 int len = Math.min(s1len, s2len);
210 start = len - 1; // we just use a single residue for the difference
213 int elen = len - start, bad = 0;
217 for (int i = start; i < len; i++)
219 chr1 = seq1.charAt(i);
221 chr2 = seq2.charAt(i);
222 agap = isGap(chr1) || isGap(chr2);
223 if ('a' <= chr1 && chr1 <= 'z')
226 // Faster than toUpperCase
229 if ('a' <= chr2 && chr2 <= 'z')
232 // Faster than toUpperCase
260 return ((float) 100 * (elen - bad)) / elen;
264 * Answers true if the supplied character is a recognised gap character, else
265 * false. Currently hard-coded to recognise '-', '-' or ' ' (hyphen / dot /
272 public static final boolean isGap(char c)
274 return c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE;
278 * Overloaded method signature to test whether a single sequence is nucleotide
279 * (that is, more than 85% CGTAUNX)
284 public static final boolean isNucleotide(SequenceI seq)
286 if (seq == null || seq.getLength() == 0)
290 long ntCount = 0; // nucleotide symbol count (does not include ntaCount)
291 long aaCount = 0; // non-nucleotide, non-gap symbol count (includes nCount
293 long nCount = 0; // "Unknown" (N) symbol count
294 long xCount = 0; // Also used as "Unknown" (X) symbol count
295 long ntaCount = 0; // nucleotide ambiguity symbol count
297 int len = seq.getLength();
298 for (int i = 0; i < len; i++)
300 char c = seq.getCharAt(i);
318 if (isNucleotideAmbiguity(c))
325 long allCount = ntCount + aaCount;
327 if (Cache.getDefault("NUCLEOTIDE_AMBIGUITY_DETECTION", true))
329 Console.info("Performing new nucleotide detection routine");
330 if (allCount > NUCLEOTIDE_COUNT_SHORT_SEQUENCE)
333 // check for at least 55% nucleotide, and nucleotide and ambiguity codes
334 // (including N) must make up 95%
335 return ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
336 && 100 * (ntCount + nCount
337 + ntaCount) > NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT
340 else if (allCount > NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
343 // check if a short sequence is at least 55% nucleotide and the rest of
344 // the symbols are all X or all N
345 if (ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
346 && (nCount == aaCount || xCount == aaCount))
352 // check for at least x% nucleotide and all the rest nucleotide
353 // ambiguity codes (including N), where x slides from 75% for sequences
354 // of length 4 (i.e. only one non-nucleotide) to 55% for sequences of
356 return myShortSequenceNucleotideProportionCount(ntCount, allCount)
357 && nCount + ntaCount == aaCount;
361 // a very short sequence. (<4)
362 // all bases must be nucleotide
363 return ntCount > 0 && ntCount == allCount;
368 Console.info("Performing old nucleotide detection routine");
370 * Check for nucleotide count > 85% of total count (in a form that evades
371 * int / float conversion or divide by zero).
373 if ((ntCount + nCount) * 100 > EIGHTY_FIVE * allCount)
375 return ntCount > 0; // all N is considered protein. Could use a
376 // threshold here too
382 protected static boolean myShortSequenceNucleotideProportionCount(
383 long ntCount, long allCount)
386 * this method is valid only for NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE <=
387 * allCount <= NUCLEOTIDE_COUNT_SHORT_SEQUENCE
389 // the following is a simplified integer version of:
391 // a := allCount # the number of bases in the sequence
392 // n : = ntCount # the number of definite nucleotide bases
393 // vs := NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE
394 // s := NUCLEOTIDE_COUNT_SHORT_SEQUENCE
395 // lp := NUCLEOTIDE_COUNT_LOWER_PERCENT
396 // vsp := 1 - (1/a) # this is the proportion of required nucleotides in
397 // # a VERY_SHORT Sequence (4 bases).
398 // # should be all but one base is nucleotide.
399 // p := (a - vs)/(s - vs) # proportion of the way between
400 // # VERY_SHORT and SHORT thresholds.
401 // tp := vsp + p * (lp/100 - vsp) # the proportion of nucleotides
402 // # required for this length of sequence.
403 // minNt := tp * a # the minimum number of definite nucleotide bases
404 // # required for this length of sequences.
406 // We are then essentially returning:
407 // # ntCount >= 55% of allCount and the rest are all nucleotide ambiguity
408 // ntCount >= tp * allCount && nCount + ntaCount == aaCount
409 // but without going into float/double land
410 long LHS = 100 * allCount
411 * (NUCLEOTIDE_COUNT_SHORT_SEQUENCE
412 - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
413 * (ntCount - allCount + 1);
414 long RHS = allCount * (allCount - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
415 * (allCount * NUCLEOTIDE_COUNT_PERCENT - 100 * allCount + 100);
420 * Answers true if more than 85% of the sequence residues (ignoring gaps) are
421 * A, G, C, T or U, else false. This is just a heuristic guess and may give a
422 * wrong answer (as AGCT are also amino acid codes).
427 public static final boolean isNucleotide(SequenceI[] seqs)
433 // true if we have seen a nucleotide sequence
435 for (SequenceI seq : seqs)
442 // TODO could possibly make an informed guess just from the first sequence
443 // to save a lengthy calculation
446 // if even one looks like protein, the alignment is protein
454 * Answers true if the character is one of aAcCgGtTuU
459 public static boolean isNucleotide(char c)
461 return isNucleotide(c, false);
465 * includeAmbiguity = true also includes Nucleotide Ambiguity codes
467 public static boolean isNucleotide(char c, boolean includeAmbiguity)
469 char C = Character.toUpperCase(c);
479 if (includeAmbiguity)
481 boolean ambiguity = isNucleotideAmbiguity(C);
489 * Tests *only* nucleotide ambiguity codes (and not definite nucleotide codes)
491 public static boolean isNucleotideAmbiguity(char c)
493 switch (Character.toUpperCase(c))
508 case 'N': // not counting N as nucleotide
513 public static boolean isN(char c)
515 return 'n' == Character.toLowerCase(c);
518 public static boolean isX(char c)
520 return 'x' == Character.toLowerCase(c);
524 * Answers true if every character in the string is one of aAcCgGtTuU, or
525 * (optionally) a gap character (dot, dash, space), else false
531 public static boolean isNucleotideSequence(String s, boolean allowGaps)
533 return isNucleotideSequence(s, allowGaps, false);
536 public static boolean isNucleotideSequence(String s, boolean allowGaps,
537 boolean includeAmbiguous)
543 for (int i = 0; i < s.length(); i++)
545 char c = s.charAt(i);
546 if (!isNucleotide(c, includeAmbiguous))
548 if (!allowGaps || !isGap(c))
558 * Convenience overload of isNucleotide
563 public static boolean isNucleotide(SequenceI[][] seqs)
569 List<SequenceI> flattened = new ArrayList<SequenceI>();
570 for (SequenceI[] ss : seqs)
572 for (SequenceI s : ss)
577 final SequenceI[] oneDArray = flattened
578 .toArray(new SequenceI[flattened.size()]);
579 return isNucleotide(oneDArray);
583 * Compares two residues either case sensitively or case insensitively
584 * depending on the caseSensitive flag
589 * second char to compare with
590 * @param caseSensitive
591 * if true comparison will be case sensitive otherwise its not
594 public static boolean isSameResidue(char c1, char c2,
595 boolean caseSensitive)
597 return caseSensitive ? c1 == c2
598 : Character.toUpperCase(c1) == Character.toUpperCase(c2);