2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import java.util.ArrayList;
24 import java.util.List;
26 import jalview.bin.Cache;
27 import jalview.bin.Console;
28 import jalview.datamodel.SequenceI;
31 * Assorted methods for analysing or comparing sequences.
33 public class Comparison
35 private static final int EIGHTY_FIVE = 85;
37 private static final int NUCLEOTIDE_COUNT_PERCENT;
39 private static final int NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT;
41 private static final int NUCLEOTIDE_COUNT_SHORT_SEQUENCE;
43 private static final int NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE;
45 private static final boolean NUCLEOTIDE_AMBIGUITY_DETECTION;
47 public static final char GAP_SPACE = ' ';
49 public static final char GAP_DOT = '.';
51 public static final char GAP_DASH = '-';
53 public static final String GapChars = new String(
55 { GAP_SPACE, GAP_DOT, GAP_DASH });
59 // these options read only at start of session
60 NUCLEOTIDE_COUNT_PERCENT = Cache.getDefault("NUCLEOTIDE_COUNT_PERCENT",
62 NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT = Cache.getDefault(
63 "NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT", 95);
64 NUCLEOTIDE_COUNT_SHORT_SEQUENCE = Cache
65 .getDefault("NUCLEOTIDE_COUNT_SHORT", 100);
66 NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE = Cache
67 .getDefault("NUCLEOTIDE_COUNT_VERY_SHORT", 4);
68 NUCLEOTIDE_AMBIGUITY_DETECTION = Cache
69 .getDefault("NUCLEOTIDE_AMBIGUITY_DETECTION", true);
80 * @return DOCUMENT ME!
82 public static final float compare(SequenceI ii, SequenceI jj)
84 return Comparison.compare(ii, jj, 0, ii.getLength() - 1);
88 * this was supposed to be an ungapped pid calculation
100 public static float compare(SequenceI ii, SequenceI jj, int start,
103 String si = ii.getSequenceAsString();
104 String sj = jj.getSequenceAsString();
106 int ilen = si.length() - 1;
107 int jlen = sj.length() - 1;
109 while (Comparison.isGap(si.charAt(start + ilen)))
114 while (Comparison.isGap(sj.charAt(start + jlen)))
124 for (int j = 0; j < jlen; j++)
126 if (si.substring(start + j, start + j + 1)
127 .equals(sj.substring(start + j, start + j + 1)))
133 pid = (float) match / (float) ilen * 100;
137 for (int j = 0; j < jlen; j++)
139 if (si.substring(start + j, start + j + 1)
140 .equals(sj.substring(start + j, start + j + 1)))
146 pid = (float) match / (float) jlen * 100;
153 * this is a gapped PID calculation
160 * @deprecated use PIDModel.computePID()
163 public final static float PID(String seq1, String seq2)
165 return PID(seq1, seq2, 0, seq1.length());
168 static final int caseShift = 'a' - 'A';
170 // Another pid with region specification
172 * @deprecated use PIDModel.computePID()
175 public final static float PID(String seq1, String seq2, int start,
178 return PID(seq1, seq2, start, end, true, false);
182 * Calculate percent identity for a pair of sequences over a particular range,
183 * with different options for ignoring gaps.
192 * - if true - gaps match any character, if false, do not match
194 * @param ungappedOnly
195 * - if true - only count PID over ungapped columns
197 * @deprecated use PIDModel.computePID()
200 public final static float PID(String seq1, String seq2, int start,
201 int end, boolean wcGaps, boolean ungappedOnly)
203 int s1len = seq1.length();
204 int s2len = seq2.length();
206 int len = Math.min(s1len, s2len);
215 start = len - 1; // we just use a single residue for the difference
218 int elen = len - start, bad = 0;
222 for (int i = start; i < len; i++)
224 chr1 = seq1.charAt(i);
226 chr2 = seq2.charAt(i);
227 agap = isGap(chr1) || isGap(chr2);
228 if ('a' <= chr1 && chr1 <= 'z')
231 // Faster than toUpperCase
234 if ('a' <= chr2 && chr2 <= 'z')
237 // Faster than toUpperCase
265 return ((float) 100 * (elen - bad)) / elen;
269 * Answers true if the supplied character is a recognised gap character, else
270 * false. Currently hard-coded to recognise '-', '-' or ' ' (hyphen / dot /
277 public static final boolean isGap(char c)
279 return c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE;
283 * Overloaded method signature to test whether a single sequence is nucleotide
284 * (that is, more than 85% CGTAUNX)
289 public static final boolean isNucleotide(SequenceI seq)
291 if (seq == null || seq.getLength() == 0)
295 long ntCount = 0; // nucleotide symbol count (does not include ntaCount)
296 long aaCount = 0; // non-nucleotide, non-gap symbol count (includes nCount
298 long nCount = 0; // "Unknown" (N) symbol count
299 long xCount = 0; // Also used as "Unknown" (X) symbol count
300 long ntaCount = 0; // nucleotide ambiguity symbol count
302 int len = seq.getLength();
303 for (int i = 0; i < len; i++)
305 char c = seq.getCharAt(i);
323 if (isNucleotideAmbiguity(c))
330 long allCount = ntCount + aaCount;
332 if (NUCLEOTIDE_AMBIGUITY_DETECTION)
334 Console.debug("Performing new nucleotide detection routine");
335 if (allCount > NUCLEOTIDE_COUNT_SHORT_SEQUENCE)
338 // check for at least 55% nucleotide, and nucleotide and ambiguity codes
339 // (including N) must make up 95%
340 return ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
341 && 100 * (ntCount + nCount
342 + ntaCount) > NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT
345 else if (allCount > NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
348 // check if a short sequence is at least 55% nucleotide and the rest of
349 // the symbols are all X or all N
350 if (ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
351 && (nCount == aaCount || xCount == aaCount))
357 // check for at least x% nucleotide and all the rest nucleotide
358 // ambiguity codes (including N), where x slides from 75% for sequences
359 // of length 4 (i.e. only one non-nucleotide) to 55% for sequences of
361 return myShortSequenceNucleotideProportionCount(ntCount, allCount)
362 && nCount + ntaCount == aaCount;
366 // a very short sequence. (<4)
367 // all bases must be nucleotide
368 return ntCount > 0 && ntCount == allCount;
373 Console.debug("Performing old nucleotide detection routine");
375 * Check for nucleotide count > 85% of total count (in a form that evades
376 * int / float conversion or divide by zero).
378 if ((ntCount + nCount) * 100 > EIGHTY_FIVE * allCount)
380 return ntCount > 0; // all N is considered protein. Could use a
381 // threshold here too
387 protected static boolean myShortSequenceNucleotideProportionCount(
388 long ntCount, long allCount)
391 * this method is valid only for NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE <=
392 * allCount <= NUCLEOTIDE_COUNT_SHORT_SEQUENCE
394 // the following is a simplified integer version of:
396 // a := allCount # the number of bases in the sequence
397 // n : = ntCount # the number of definite nucleotide bases
398 // vs := NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE
399 // s := NUCLEOTIDE_COUNT_SHORT_SEQUENCE
400 // lp := NUCLEOTIDE_COUNT_LOWER_PERCENT
401 // vsp := 1 - (1/a) # this is the proportion of required definite
403 // # in a VERY_SHORT Sequence (4 bases).
404 // # This should be equivalent to all but one base in the sequence.
405 // p := (a - vs)/(s - vs) # proportion of the way between
406 // # VERY_SHORT and SHORT thresholds.
407 // tp := vsp + p * (lp/100 - vsp) # the proportion of definite nucleotides
408 // # required for this length of sequence.
409 // minNt := tp * a # the minimum number of definite nucleotide bases
410 // # required for this length of sequence.
412 // We are then essentially returning:
413 // # ntCount >= 55% of allCount and the rest are all nucleotide ambiguity:
414 // ntCount >= tp * allCount && nCount + ntaCount == aaCount
415 // but without going into float/double land
416 long LHS = 100 * allCount
417 * (NUCLEOTIDE_COUNT_SHORT_SEQUENCE
418 - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
419 * (ntCount - allCount + 1);
420 long RHS = allCount * (allCount - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
421 * (allCount * NUCLEOTIDE_COUNT_PERCENT - 100 * allCount + 100);
426 * Answers true if more than 85% of the sequence residues (ignoring gaps) are
427 * A, G, C, T or U, else false. This is just a heuristic guess and may give a
428 * wrong answer (as AGCT are also amino acid codes).
433 public static final boolean isNucleotide(SequenceI[] seqs)
439 // true if we have seen a nucleotide sequence
441 for (SequenceI seq : seqs)
448 // TODO could possibly make an informed guess just from the first sequence
449 // to save a lengthy calculation
452 // if even one looks like protein, the alignment is protein
460 * Answers true if the character is one of aAcCgGtTuU
465 public static boolean isNucleotide(char c)
467 return isNucleotide(c, false);
471 * includeAmbiguity = true also includes Nucleotide Ambiguity codes
473 public static boolean isNucleotide(char c, boolean includeAmbiguity)
475 char C = Character.toUpperCase(c);
485 if (includeAmbiguity)
487 boolean ambiguity = isNucleotideAmbiguity(C);
495 * Tests *only* nucleotide ambiguity codes (and not definite nucleotide codes)
497 public static boolean isNucleotideAmbiguity(char c)
499 switch (Character.toUpperCase(c))
514 case 'N': // not counting N as nucleotide
519 public static boolean isN(char c)
521 return 'n' == Character.toLowerCase(c);
524 public static boolean isX(char c)
526 return 'x' == Character.toLowerCase(c);
530 * Answers true if every character in the string is one of aAcCgGtTuU, or
531 * (optionally) a gap character (dot, dash, space), else false
537 public static boolean isNucleotideSequence(String s, boolean allowGaps)
539 return isNucleotideSequence(s, allowGaps, false);
542 public static boolean isNucleotideSequence(String s, boolean allowGaps,
543 boolean includeAmbiguous)
549 for (int i = 0; i < s.length(); i++)
551 char c = s.charAt(i);
552 if (!isNucleotide(c, includeAmbiguous))
554 if (!allowGaps || !isGap(c))
564 * Convenience overload of isNucleotide
569 public static boolean isNucleotide(SequenceI[][] seqs)
575 List<SequenceI> flattened = new ArrayList<SequenceI>();
576 for (SequenceI[] ss : seqs)
578 for (SequenceI s : ss)
583 final SequenceI[] oneDArray = flattened
584 .toArray(new SequenceI[flattened.size()]);
585 return isNucleotide(oneDArray);
589 * Compares two residues either case sensitively or case insensitively
590 * depending on the caseSensitive flag
595 * second char to compare with
596 * @param caseSensitive
597 * if true comparison will be case sensitive otherwise its not
600 public static boolean isSameResidue(char c1, char c2,
601 boolean caseSensitive)
603 return caseSensitive ? c1 == c2
604 : Character.toUpperCase(c1) == Character.toUpperCase(c2);