src/jalview/util/Comparison.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import java.util.ArrayList;
  24 import java.util.List;
  25
  26 import jalview.bin.Cache;
  27 import jalview.bin.Console;
  28 import jalview.datamodel.SequenceI;
  29
  30 /**
  31  * Assorted methods for analysing or comparing sequences.
  32  */
  33 public class Comparison
  34 {
  35   private static final int EIGHTY_FIVE = 85;
  36
  37   private static final int NUCLEOTIDE_COUNT_PERCENT;
  38
  39   private static final int NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT;
  40
  41   private static final int NUCLEOTIDE_COUNT_SHORT_SEQUENCE;
  42
  43   private static final int NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE;
  44
  45   public static final char GAP_SPACE = ' ';
  46
  47   public static final char GAP_DOT = '.';
  48
  49   public static final char GAP_DASH = '-';
  50
  51   public static final String GapChars = new String(
  52           new char[]
  53           { GAP_SPACE, GAP_DOT, GAP_DASH });
  54
  55   static
  56   {
  57     NUCLEOTIDE_COUNT_PERCENT = Cache.getDefault("NUCLEOTIDE_COUNT_PERCENT",
  58             55);
  59     NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT = Cache.getDefault(
  60             "NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT", 95);
  61     NUCLEOTIDE_COUNT_SHORT_SEQUENCE = Cache
  62             .getDefault("NUCLEOTIDE_COUNT_SHORT", 100);
  63     NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE = Cache
  64             .getDefault("NUCLEOTIDE_COUNT_VERY_SHORT", 4);
  65   }
  66
  67   /**
  68    * DOCUMENT ME!
  69    *
  70    * @param ii
  71    *          DOCUMENT ME!
  72    * @param jj
  73    *          DOCUMENT ME!
  74    *
  75    * @return DOCUMENT ME!
  76    */
  77   public static final float compare(SequenceI ii, SequenceI jj)
  78   {
  79     return Comparison.compare(ii, jj, 0, ii.getLength() - 1);
  80   }
  81
  82   /**
  83    * this was supposed to be an ungapped pid calculation
  84    *
  85    * @param ii
  86    *          SequenceI
  87    * @param jj
  88    *          SequenceI
  89    * @param start
  90    *          int
  91    * @param end
  92    *          int
  93    * @return float
  94    */
  95   public static float compare(SequenceI ii, SequenceI jj, int start,
  96           int end)
  97   {
  98     String si = ii.getSequenceAsString();
  99     String sj = jj.getSequenceAsString();
 100
 101     int ilen = si.length() - 1;
 102     int jlen = sj.length() - 1;
 103
 104     while (Comparison.isGap(si.charAt(start + ilen)))
 105     {
 106       ilen--;
 107     }
 108
 109     while (Comparison.isGap(sj.charAt(start + jlen)))
 110     {
 111       jlen--;
 112     }
 113
 114     int match = 0;
 115     float pid = -1;
 116
 117     if (ilen > jlen)
 118     {
 119       for (int j = 0; j < jlen; j++)
 120       {
 121         if (si.substring(start + j, start + j + 1)
 122                 .equals(sj.substring(start + j, start + j + 1)))
 123         {
 124           match++;
 125         }
 126       }
 127
 128       pid = (float) match / (float) ilen * 100;
 129     }
 130     else
 131     {
 132       for (int j = 0; j < jlen; j++)
 133       {
 134         if (si.substring(start + j, start + j + 1)
 135                 .equals(sj.substring(start + j, start + j + 1)))
 136         {
 137           match++;
 138         }
 139       }
 140
 141       pid = (float) match / (float) jlen * 100;
 142     }
 143
 144     return pid;
 145   }
 146
 147   /**
 148    * this is a gapped PID calculation
 149    *
 150    * @param s1
 151    *          SequenceI
 152    * @param s2
 153    *          SequenceI
 154    * @return float
 155    * @deprecated use PIDModel.computePID()
 156    */
 157   @Deprecated
 158   public final static float PID(String seq1, String seq2)
 159   {
 160     return PID(seq1, seq2, 0, seq1.length());
 161   }
 162
 163   static final int caseShift = 'a' - 'A';
 164
 165   // Another pid with region specification
 166   /**
 167    * @deprecated use PIDModel.computePID()
 168    */
 169   @Deprecated
 170   public final static float PID(String seq1, String seq2, int start,
 171           int end)
 172   {
 173     return PID(seq1, seq2, start, end, true, false);
 174   }
 175
 176   /**
 177    * Calculate percent identity for a pair of sequences over a particular range,
 178    * with different options for ignoring gaps.
 179    *
 180    * @param seq1
 181    * @param seq2
 182    * @param start
 183    *          - position in seqs
 184    * @param end
 185    *          - position in seqs
 186    * @param wcGaps
 187    *          - if true - gaps match any character, if false, do not match
 188    *          anything
 189    * @param ungappedOnly
 190    *          - if true - only count PID over ungapped columns
 191    * @return
 192    * @deprecated use PIDModel.computePID()
 193    */
 194   @Deprecated
 195   public final static float PID(String seq1, String seq2, int start,
 196           int end, boolean wcGaps, boolean ungappedOnly)
 197   {
 198     int s1len = seq1.length();
 199     int s2len = seq2.length();
 200
 201     int len = Math.min(s1len, s2len);
 202
 203     if (end < len)
 204     {
 205       len = end;
 206     }
 207
 208     if (len < start)
 209     {
 210       start = len - 1; // we just use a single residue for the difference
 211     }
 212
 213     int elen = len - start, bad = 0;
 214     char chr1;
 215     char chr2;
 216     boolean agap;
 217     for (int i = start; i < len; i++)
 218     {
 219       chr1 = seq1.charAt(i);
 220
 221       chr2 = seq2.charAt(i);
 222       agap = isGap(chr1) || isGap(chr2);
 223       if ('a' <= chr1 && chr1 <= 'z')
 224       {
 225         // TO UPPERCASE !!!
 226         // Faster than toUpperCase
 227         chr1 -= caseShift;
 228       }
 229       if ('a' <= chr2 && chr2 <= 'z')
 230       {
 231         // TO UPPERCASE !!!
 232         // Faster than toUpperCase
 233         chr2 -= caseShift;
 234       }
 235
 236       if (chr1 != chr2)
 237       {
 238         if (agap)
 239         {
 240           if (ungappedOnly)
 241           {
 242             elen--;
 243           }
 244           else if (!wcGaps)
 245           {
 246             bad++;
 247           }
 248         }
 249         else
 250         {
 251           bad++;
 252         }
 253       }
 254
 255     }
 256     if (elen < 1)
 257     {
 258       return 0f;
 259     }
 260     return ((float) 100 * (elen - bad)) / elen;
 261   }
 262
 263   /**
 264    * Answers true if the supplied character is a recognised gap character, else
 265    * false. Currently hard-coded to recognise '-', '-' or ' ' (hyphen / dot /
 266    * space).
 267    *
 268    * @param c
 269    *
 270    * @return
 271    */
 272   public static final boolean isGap(char c)
 273   {
 274     return c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE;
 275   }
 276
 277   /**
 278    * Overloaded method signature to test whether a single sequence is nucleotide
 279    * (that is, more than 85% CGTAUNX)
 280    *
 281    * @param seq
 282    * @return
 283    */
 284   public static final boolean isNucleotide(SequenceI seq)
 285   {
 286     if (seq == null || seq.getLength() == 0)
 287     {
 288       return false;
 289     }
 290     long ntCount = 0; // nucleotide symbol count (does not include ntaCount)
 291     long aaCount = 0; // non-nucleotide, non-gap symbol count (includes nCount
 292                       // and ntaCount)
 293     long nCount = 0; // "Unknown" (N) symbol count
 294     long xCount = 0; // Also used as "Unknown" (X) symbol count
 295     long ntaCount = 0; // nucleotide ambiguity symbol count
 296
 297     int len = seq.getLength();
 298     for (int i = 0; i < len; i++)
 299     {
 300       char c = seq.getCharAt(i);
 301       if (isNucleotide(c))
 302       {
 303         ntCount++;
 304       }
 305       else if (!isGap(c))
 306       {
 307         aaCount++;
 308         if (isN(c))
 309         {
 310           nCount++;
 311         }
 312         else
 313         {
 314           if (isX(c))
 315           {
 316             xCount++;
 317           }
 318           if (isNucleotideAmbiguity(c))
 319           {
 320             ntaCount++;
 321           }
 322         }
 323       }
 324     }
 325     long allCount = ntCount + aaCount;
 326
 327     if (Cache.getDefault("NUCLEOTIDE_AMBIGUITY_DETECTION", true))
 328     {
 329       Console.info("Performing new nucleotide detection routine");
 330       if (allCount > NUCLEOTIDE_COUNT_SHORT_SEQUENCE)
 331       {
 332         // a long sequence.
 333         // check for at least 55% nucleotide, and nucleotide and ambiguity codes
 334         // (including N) must make up 95%
 335         return ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
 336                 && 100 * (ntCount + nCount
 337                         + ntaCount) > NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT
 338                                 * allCount;
 339       }
 340       else if (allCount > NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
 341       {
 342         // a short sequence.
 343         // check if a short sequence is at least 55% nucleotide and the rest of
 344         // the symbols are all X or all N
 345         if (ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
 346                 && (nCount == aaCount || xCount == aaCount))
 347         {
 348           return true;
 349         }
 350
 351         // a short sequence.
 352         // check for at least x% nucleotide and all the rest nucleotide
 353         // ambiguity codes (including N), where x slides from 75% for sequences
 354         // of length 4 (i.e. only one non-nucleotide) to 55% for sequences of
 355         // length 100
 356         return myShortSequenceNucleotideProportionCount(ntCount, allCount)
 357                 && nCount + ntaCount == aaCount;
 358       }
 359       else
 360       {
 361         // a very short sequence. (<4)
 362         // all bases must be nucleotide
 363         return ntCount > 0 && ntCount == allCount;
 364       }
 365     }
 366     else
 367     {
 368       Console.info("Performing old nucleotide detection routine");
 369       /*
 370        * Check for nucleotide count > 85% of total count (in a form that evades
 371        * int / float conversion or divide by zero).
 372        */
 373       if ((ntCount + nCount) * 100 > EIGHTY_FIVE * allCount)
 374       {
 375         return ntCount > 0; // all N is considered protein. Could use a
 376                             // threshold here too
 377       }
 378     }
 379     return false;
 380   }
 381
 382   protected static boolean myShortSequenceNucleotideProportionCount(
 383           long ntCount, long allCount)
 384   {
 385     /**
 386      * this method is valid only for NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE <=
 387      * allCount <= NUCLEOTIDE_COUNT_SHORT_SEQUENCE
 388      */
 389     // the following is a simplified integer version of:
 390     //
 391     // a := allCount # the number of bases in the sequence
 392     // n : = ntCount # the number of definite nucleotide bases
 393     // vs := NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE
 394     // s := NUCLEOTIDE_COUNT_SHORT_SEQUENCE
 395     // lp := NUCLEOTIDE_COUNT_LOWER_PERCENT
 396     // vsp := 1 - (1/a) # this is the proportion of required nucleotides in
 397     // # a VERY_SHORT Sequence (4 bases).
 398     // # should be all but one base is nucleotide.
 399     // p := (a - vs)/(s - vs) # proportion of the way between
 400     // # VERY_SHORT and SHORT thresholds.
 401     // tp := vsp + p * (lp/100 - vsp) # the proportion of nucleotides
 402     // # required for this length of sequence.
 403     // minNt := tp * a # the minimum number of definite nucleotide bases
 404     // # required for this length of sequences.
 405     //
 406     // We are then essentially returning:
 407     // # ntCount >= 55% of allCount and the rest are all nucleotide ambiguity
 408     // ntCount >= tp * allCount && nCount + ntaCount == aaCount
 409     // but without going into float/double land
 410     long LHS = 100 * allCount
 411             * (NUCLEOTIDE_COUNT_SHORT_SEQUENCE
 412                     - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
 413             * (ntCount - allCount + 1);
 414     long RHS = allCount * (allCount - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
 415             * (allCount * NUCLEOTIDE_COUNT_PERCENT - 100 * allCount + 100);
 416     return LHS >= RHS;
 417   }
 418
 419   /**
 420    * Answers true if more than 85% of the sequence residues (ignoring gaps) are
 421    * A, G, C, T or U, else false. This is just a heuristic guess and may give a
 422    * wrong answer (as AGCT are also amino acid codes).
 423    *
 424    * @param seqs
 425    * @return
 426    */
 427   public static final boolean isNucleotide(SequenceI[] seqs)
 428   {
 429     if (seqs == null)
 430     {
 431       return false;
 432     }
 433     // true if we have seen a nucleotide sequence
 434     boolean na = false;
 435     for (SequenceI seq : seqs)
 436     {
 437       if (seq == null)
 438       {
 439         continue;
 440       }
 441       na = true;
 442       // TODO could possibly make an informed guess just from the first sequence
 443       // to save a lengthy calculation
 444       if (seq.isProtein())
 445       {
 446         // if even one looks like protein, the alignment is protein
 447         return false;
 448       }
 449     }
 450     return na;
 451   }
 452
 453   /**
 454    * Answers true if the character is one of aAcCgGtTuU
 455    *
 456    * @param c
 457    * @return
 458    */
 459   public static boolean isNucleotide(char c)
 460   {
 461     return isNucleotide(c, false);
 462   }
 463
 464   /**
 465    * includeAmbiguity = true also includes Nucleotide Ambiguity codes
 466    */
 467   public static boolean isNucleotide(char c, boolean includeAmbiguity)
 468   {
 469     char C = Character.toUpperCase(c);
 470     switch (C)
 471     {
 472     case 'A':
 473     case 'C':
 474     case 'G':
 475     case 'T':
 476     case 'U':
 477       return true;
 478     }
 479     if (includeAmbiguity)
 480     {
 481       boolean ambiguity = isNucleotideAmbiguity(C);
 482       if (ambiguity)
 483         return true;
 484     }
 485     return false;
 486   }
 487
 488   /**
 489    * Tests *only* nucleotide ambiguity codes (and not definite nucleotide codes)
 490    */
 491   public static boolean isNucleotideAmbiguity(char c)
 492   {
 493     switch (Character.toUpperCase(c))
 494     {
 495     case 'I':
 496     case 'X':
 497     case 'R':
 498     case 'Y':
 499     case 'W':
 500     case 'S':
 501     case 'M':
 502     case 'K':
 503     case 'B':
 504     case 'H':
 505     case 'D':
 506     case 'V':
 507       return true;
 508     case 'N': // not counting N as nucleotide
 509     }
 510     return false;
 511   }
 512
 513   public static boolean isN(char c)
 514   {
 515     return 'n' == Character.toLowerCase(c);
 516   }
 517
 518   public static boolean isX(char c)
 519   {
 520     return 'x' == Character.toLowerCase(c);
 521   }
 522
 523   /**
 524    * Answers true if every character in the string is one of aAcCgGtTuU, or
 525    * (optionally) a gap character (dot, dash, space), else false
 526    *
 527    * @param s
 528    * @param allowGaps
 529    * @return
 530    */
 531   public static boolean isNucleotideSequence(String s, boolean allowGaps)
 532   {
 533     return isNucleotideSequence(s, allowGaps, false);
 534   }
 535
 536   public static boolean isNucleotideSequence(String s, boolean allowGaps,
 537           boolean includeAmbiguous)
 538   {
 539     if (s == null)
 540     {
 541       return false;
 542     }
 543     for (int i = 0; i < s.length(); i++)
 544     {
 545       char c = s.charAt(i);
 546       if (!isNucleotide(c, includeAmbiguous))
 547       {
 548         if (!allowGaps || !isGap(c))
 549         {
 550           return false;
 551         }
 552       }
 553     }
 554     return true;
 555   }
 556
 557   /**
 558    * Convenience overload of isNucleotide
 559    *
 560    * @param seqs
 561    * @return
 562    */
 563   public static boolean isNucleotide(SequenceI[][] seqs)
 564   {
 565     if (seqs == null)
 566     {
 567       return false;
 568     }
 569     List<SequenceI> flattened = new ArrayList<SequenceI>();
 570     for (SequenceI[] ss : seqs)
 571     {
 572       for (SequenceI s : ss)
 573       {
 574         flattened.add(s);
 575       }
 576     }
 577     final SequenceI[] oneDArray = flattened
 578             .toArray(new SequenceI[flattened.size()]);
 579     return isNucleotide(oneDArray);
 580   }
 581
 582   /**
 583    * Compares two residues either case sensitively or case insensitively
 584    * depending on the caseSensitive flag
 585    *
 586    * @param c1
 587    *          first char
 588    * @param c2
 589    *          second char to compare with
 590    * @param caseSensitive
 591    *          if true comparison will be case sensitive otherwise its not
 592    * @return
 593    */
 594   public static boolean isSameResidue(char c1, char c2,
 595           boolean caseSensitive)
 596   {
 597     return caseSensitive ? c1 == c2
 598             : Character.toUpperCase(c1) == Character.toUpperCase(c2);
 599   }
 600 }