src/jalview/util/Comparison.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.util;
  22
  23 import java.util.ArrayList;
  24 import java.util.List;
  25
  26 import jalview.bin.Cache;
  27 import jalview.bin.Console;
  28 import jalview.datamodel.SequenceI;
  29
  30 /**
  31  * Assorted methods for analysing or comparing sequences.
  32  */
  33 public class Comparison
  34 {
  35   private static final int EIGHTY_FIVE = 85;
  36
  37   private static final int NUCLEOTIDE_COUNT_PERCENT;
  38
  39   private static final int NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT;
  40
  41   private static final int NUCLEOTIDE_COUNT_SHORT_SEQUENCE;
  42
  43   private static final int NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE;
  44
  45   private static final boolean NUCLEOTIDE_AMBIGUITY_DETECTION;
  46
  47   public static final char GAP_SPACE = ' ';
  48
  49   public static final char GAP_DOT = '.';
  50
  51   public static final char GAP_DASH = '-';
  52
  53   public static final String GapChars = new String(
  54           new char[]
  55           { GAP_SPACE, GAP_DOT, GAP_DASH });
  56
  57   static
  58   {
  59     // these options read only at start of session
  60     NUCLEOTIDE_COUNT_PERCENT = Cache.getDefault("NUCLEOTIDE_COUNT_PERCENT",
  61             55);
  62     NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT = Cache.getDefault(
  63             "NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT", 95);
  64     NUCLEOTIDE_COUNT_SHORT_SEQUENCE = Cache
  65             .getDefault("NUCLEOTIDE_COUNT_SHORT", 100);
  66     NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE = Cache
  67             .getDefault("NUCLEOTIDE_COUNT_VERY_SHORT", 4);
  68     NUCLEOTIDE_AMBIGUITY_DETECTION = Cache
  69             .getDefault("NUCLEOTIDE_AMBIGUITY_DETECTION", true);
  70   }
  71
  72   /**
  73    * DOCUMENT ME!
  74    *
  75    * @param ii
  76    *          DOCUMENT ME!
  77    * @param jj
  78    *          DOCUMENT ME!
  79    *
  80    * @return DOCUMENT ME!
  81    */
  82   public static final float compare(SequenceI ii, SequenceI jj)
  83   {
  84     return Comparison.compare(ii, jj, 0, ii.getLength() - 1);
  85   }
  86
  87   /**
  88    * this was supposed to be an ungapped pid calculation
  89    *
  90    * @param ii
  91    *          SequenceI
  92    * @param jj
  93    *          SequenceI
  94    * @param start
  95    *          int
  96    * @param end
  97    *          int
  98    * @return float
  99    */
 100   public static float compare(SequenceI ii, SequenceI jj, int start,
 101           int end)
 102   {
 103     String si = ii.getSequenceAsString();
 104     String sj = jj.getSequenceAsString();
 105
 106     int ilen = si.length() - 1;
 107     int jlen = sj.length() - 1;
 108
 109     while (Comparison.isGap(si.charAt(start + ilen)))
 110     {
 111       ilen--;
 112     }
 113
 114     while (Comparison.isGap(sj.charAt(start + jlen)))
 115     {
 116       jlen--;
 117     }
 118
 119     int match = 0;
 120     float pid = -1;
 121
 122     if (ilen > jlen)
 123     {
 124       for (int j = 0; j < jlen; j++)
 125       {
 126         if (si.substring(start + j, start + j + 1)
 127                 .equals(sj.substring(start + j, start + j + 1)))
 128         {
 129           match++;
 130         }
 131       }
 132
 133       pid = (float) match / (float) ilen * 100;
 134     }
 135     else
 136     {
 137       for (int j = 0; j < jlen; j++)
 138       {
 139         if (si.substring(start + j, start + j + 1)
 140                 .equals(sj.substring(start + j, start + j + 1)))
 141         {
 142           match++;
 143         }
 144       }
 145
 146       pid = (float) match / (float) jlen * 100;
 147     }
 148
 149     return pid;
 150   }
 151
 152   /**
 153    * this is a gapped PID calculation
 154    *
 155    * @param s1
 156    *          SequenceI
 157    * @param s2
 158    *          SequenceI
 159    * @return float
 160    * @deprecated use PIDModel.computePID()
 161    */
 162   @Deprecated
 163   public final static float PID(String seq1, String seq2)
 164   {
 165     return PID(seq1, seq2, 0, seq1.length());
 166   }
 167
 168   static final int caseShift = 'a' - 'A';
 169
 170   // Another pid with region specification
 171   /**
 172    * @deprecated use PIDModel.computePID()
 173    */
 174   @Deprecated
 175   public final static float PID(String seq1, String seq2, int start,
 176           int end)
 177   {
 178     return PID(seq1, seq2, start, end, true, false);
 179   }
 180
 181   /**
 182    * Calculate percent identity for a pair of sequences over a particular range,
 183    * with different options for ignoring gaps.
 184    *
 185    * @param seq1
 186    * @param seq2
 187    * @param start
 188    *          - position in seqs
 189    * @param end
 190    *          - position in seqs
 191    * @param wcGaps
 192    *          - if true - gaps match any character, if false, do not match
 193    *          anything
 194    * @param ungappedOnly
 195    *          - if true - only count PID over ungapped columns
 196    * @return
 197    * @deprecated use PIDModel.computePID()
 198    */
 199   @Deprecated
 200   public final static float PID(String seq1, String seq2, int start,
 201           int end, boolean wcGaps, boolean ungappedOnly)
 202   {
 203     int s1len = seq1.length();
 204     int s2len = seq2.length();
 205
 206     int len = Math.min(s1len, s2len);
 207
 208     if (end < len)
 209     {
 210       len = end;
 211     }
 212
 213     if (len < start)
 214     {
 215       start = len - 1; // we just use a single residue for the difference
 216     }
 217
 218     int elen = len - start, bad = 0;
 219     char chr1;
 220     char chr2;
 221     boolean agap;
 222     for (int i = start; i < len; i++)
 223     {
 224       chr1 = seq1.charAt(i);
 225
 226       chr2 = seq2.charAt(i);
 227       agap = isGap(chr1) || isGap(chr2);
 228       if ('a' <= chr1 && chr1 <= 'z')
 229       {
 230         // TO UPPERCASE !!!
 231         // Faster than toUpperCase
 232         chr1 -= caseShift;
 233       }
 234       if ('a' <= chr2 && chr2 <= 'z')
 235       {
 236         // TO UPPERCASE !!!
 237         // Faster than toUpperCase
 238         chr2 -= caseShift;
 239       }
 240
 241       if (chr1 != chr2)
 242       {
 243         if (agap)
 244         {
 245           if (ungappedOnly)
 246           {
 247             elen--;
 248           }
 249           else if (!wcGaps)
 250           {
 251             bad++;
 252           }
 253         }
 254         else
 255         {
 256           bad++;
 257         }
 258       }
 259
 260     }
 261     if (elen < 1)
 262     {
 263       return 0f;
 264     }
 265     return ((float) 100 * (elen - bad)) / elen;
 266   }
 267
 268   /**
 269    * Answers true if the supplied character is a recognised gap character, else
 270    * false. Currently hard-coded to recognise '-', '-' or ' ' (hyphen / dot /
 271    * space).
 272    *
 273    * @param c
 274    *
 275    * @return
 276    */
 277   public static final boolean isGap(char c)
 278   {
 279     return c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE;
 280   }
 281
 282   /**
 283    * Overloaded method signature to test whether a single sequence is nucleotide
 284    * (that is, more than 85% CGTAUNX)
 285    *
 286    * @param seq
 287    * @return
 288    */
 289   public static final boolean isNucleotide(SequenceI seq)
 290   {
 291     if (seq == null || seq.getLength() == 0)
 292     {
 293       return false;
 294     }
 295     long ntCount = 0; // nucleotide symbol count (does not include ntaCount)
 296     long aaCount = 0; // non-nucleotide, non-gap symbol count (includes nCount
 297                       // and ntaCount)
 298     long nCount = 0; // "Unknown" (N) symbol count
 299     long xCount = 0; // Also used as "Unknown" (X) symbol count
 300     long ntaCount = 0; // nucleotide ambiguity symbol count
 301
 302     int len = seq.getLength();
 303     for (int i = 0; i < len; i++)
 304     {
 305       char c = seq.getCharAt(i);
 306       if (isNucleotide(c))
 307       {
 308         ntCount++;
 309       }
 310       else if (!isGap(c))
 311       {
 312         aaCount++;
 313         if (isN(c))
 314         {
 315           nCount++;
 316         }
 317         else
 318         {
 319           if (isX(c))
 320           {
 321             xCount++;
 322           }
 323           if (isNucleotideAmbiguity(c))
 324           {
 325             ntaCount++;
 326           }
 327         }
 328       }
 329     }
 330     long allCount = ntCount + aaCount;
 331
 332     if (NUCLEOTIDE_AMBIGUITY_DETECTION)
 333     {
 334       Console.debug("Performing new nucleotide detection routine");
 335       if (allCount > NUCLEOTIDE_COUNT_SHORT_SEQUENCE)
 336       {
 337         // a long sequence.
 338         // check for at least 55% nucleotide, and nucleotide and ambiguity codes
 339         // (including N) must make up 95%
 340         return ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
 341                 && 100 * (ntCount + nCount
 342                         + ntaCount) > NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT
 343                                 * allCount;
 344       }
 345       else if (allCount > NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
 346       {
 347         // a short sequence.
 348         // check if a short sequence is at least 55% nucleotide and the rest of
 349         // the symbols are all X or all N
 350         if (ntCount * 100 > NUCLEOTIDE_COUNT_PERCENT * allCount
 351                 && (nCount == aaCount || xCount == aaCount))
 352         {
 353           return true;
 354         }
 355
 356         // a short sequence.
 357         // check for at least x% nucleotide and all the rest nucleotide
 358         // ambiguity codes (including N), where x slides from 75% for sequences
 359         // of length 4 (i.e. only one non-nucleotide) to 55% for sequences of
 360         // length 100
 361         return myShortSequenceNucleotideProportionCount(ntCount, allCount)
 362                 && nCount + ntaCount == aaCount;
 363       }
 364       else
 365       {
 366         // a very short sequence. (<4)
 367         // all bases must be nucleotide
 368         return ntCount > 0 && ntCount == allCount;
 369       }
 370     }
 371     else
 372     {
 373       Console.debug("Performing old nucleotide detection routine");
 374       /*
 375        * Check for nucleotide count > 85% of total count (in a form that evades
 376        * int / float conversion or divide by zero).
 377        */
 378       if ((ntCount + nCount) * 100 > EIGHTY_FIVE * allCount)
 379       {
 380         return ntCount > 0; // all N is considered protein. Could use a
 381                             // threshold here too
 382       }
 383     }
 384     return false;
 385   }
 386
 387   protected static boolean myShortSequenceNucleotideProportionCount(
 388           long ntCount, long allCount)
 389   {
 390     /**
 391      * this method is valid only for NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE <=
 392      * allCount <= NUCLEOTIDE_COUNT_SHORT_SEQUENCE
 393      */
 394     // the following is a simplified integer version of:
 395     //
 396     // a := allCount # the number of bases in the sequence
 397     // n : = ntCount # the number of definite nucleotide bases
 398     // vs := NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE
 399     // s := NUCLEOTIDE_COUNT_SHORT_SEQUENCE
 400     // lp := NUCLEOTIDE_COUNT_LOWER_PERCENT
 401     // vsp := 1 - (1/a) # this is the proportion of required definite
 402     // nucleotides
 403     // # in a VERY_SHORT Sequence (4 bases).
 404     // # This should be equivalent to all but one base in the sequence.
 405     // p := (a - vs)/(s - vs) # proportion of the way between
 406     // # VERY_SHORT and SHORT thresholds.
 407     // tp := vsp + p * (lp/100 - vsp) # the proportion of definite nucleotides
 408     // # required for this length of sequence.
 409     // minNt := tp * a # the minimum number of definite nucleotide bases
 410     // # required for this length of sequence.
 411     //
 412     // We are then essentially returning:
 413     // # ntCount >= 55% of allCount and the rest are all nucleotide ambiguity:
 414     // ntCount >= tp * allCount && nCount + ntaCount == aaCount
 415     // but without going into float/double land
 416     long LHS = 100 * allCount
 417             * (NUCLEOTIDE_COUNT_SHORT_SEQUENCE
 418                     - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
 419             * (ntCount - allCount + 1);
 420     long RHS = allCount * (allCount - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
 421             * (allCount * NUCLEOTIDE_COUNT_PERCENT - 100 * allCount + 100);
 422     return LHS >= RHS;
 423   }
 424
 425   /**
 426    * Answers true if more than 85% of the sequence residues (ignoring gaps) are
 427    * A, G, C, T or U, else false. This is just a heuristic guess and may give a
 428    * wrong answer (as AGCT are also amino acid codes).
 429    *
 430    * @param seqs
 431    * @return
 432    */
 433   public static final boolean isNucleotide(SequenceI[] seqs)
 434   {
 435     if (seqs == null)
 436     {
 437       return false;
 438     }
 439     // true if we have seen a nucleotide sequence
 440     boolean na = false;
 441     for (SequenceI seq : seqs)
 442     {
 443       if (seq == null)
 444       {
 445         continue;
 446       }
 447       na = true;
 448       // TODO could possibly make an informed guess just from the first sequence
 449       // to save a lengthy calculation
 450       if (seq.isProtein())
 451       {
 452         // if even one looks like protein, the alignment is protein
 453         return false;
 454       }
 455     }
 456     return na;
 457   }
 458
 459   /**
 460    * Answers true if the character is one of aAcCgGtTuU
 461    *
 462    * @param c
 463    * @return
 464    */
 465   public static boolean isNucleotide(char c)
 466   {
 467     return isNucleotide(c, false);
 468   }
 469
 470   /**
 471    * includeAmbiguity = true also includes Nucleotide Ambiguity codes
 472    */
 473   public static boolean isNucleotide(char c, boolean includeAmbiguity)
 474   {
 475     char C = Character.toUpperCase(c);
 476     switch (C)
 477     {
 478     case 'A':
 479     case 'C':
 480     case 'G':
 481     case 'T':
 482     case 'U':
 483       return true;
 484     }
 485     if (includeAmbiguity)
 486     {
 487       boolean ambiguity = isNucleotideAmbiguity(C);
 488       if (ambiguity)
 489         return true;
 490     }
 491     return false;
 492   }
 493
 494   /**
 495    * Tests *only* nucleotide ambiguity codes (and not definite nucleotide codes)
 496    */
 497   public static boolean isNucleotideAmbiguity(char c)
 498   {
 499     switch (Character.toUpperCase(c))
 500     {
 501     case 'I':
 502     case 'X':
 503     case 'R':
 504     case 'Y':
 505     case 'W':
 506     case 'S':
 507     case 'M':
 508     case 'K':
 509     case 'B':
 510     case 'H':
 511     case 'D':
 512     case 'V':
 513       return true;
 514     case 'N': // not counting N as nucleotide
 515     }
 516     return false;
 517   }
 518
 519   public static boolean isN(char c)
 520   {
 521     return 'n' == Character.toLowerCase(c);
 522   }
 523
 524   public static boolean isX(char c)
 525   {
 526     return 'x' == Character.toLowerCase(c);
 527   }
 528
 529   /**
 530    * Answers true if every character in the string is one of aAcCgGtTuU, or
 531    * (optionally) a gap character (dot, dash, space), else false
 532    *
 533    * @param s
 534    * @param allowGaps
 535    * @return
 536    */
 537   public static boolean isNucleotideSequence(String s, boolean allowGaps)
 538   {
 539     return isNucleotideSequence(s, allowGaps, false);
 540   }
 541
 542   public static boolean isNucleotideSequence(String s, boolean allowGaps,
 543           boolean includeAmbiguous)
 544   {
 545     if (s == null)
 546     {
 547       return false;
 548     }
 549     for (int i = 0; i < s.length(); i++)
 550     {
 551       char c = s.charAt(i);
 552       if (!isNucleotide(c, includeAmbiguous))
 553       {
 554         if (!allowGaps || !isGap(c))
 555         {
 556           return false;
 557         }
 558       }
 559     }
 560     return true;
 561   }
 562
 563   /**
 564    * Convenience overload of isNucleotide
 565    *
 566    * @param seqs
 567    * @return
 568    */
 569   public static boolean isNucleotide(SequenceI[][] seqs)
 570   {
 571     if (seqs == null)
 572     {
 573       return false;
 574     }
 575     List<SequenceI> flattened = new ArrayList<SequenceI>();
 576     for (SequenceI[] ss : seqs)
 577     {
 578       for (SequenceI s : ss)
 579       {
 580         flattened.add(s);
 581       }
 582     }
 583     final SequenceI[] oneDArray = flattened
 584             .toArray(new SequenceI[flattened.size()]);
 585     return isNucleotide(oneDArray);
 586   }
 587
 588   /**
 589    * Compares two residues either case sensitively or case insensitively
 590    * depending on the caseSensitive flag
 591    *
 592    * @param c1
 593    *          first char
 594    * @param c2
 595    *          second char to compare with
 596    * @param caseSensitive
 597    *          if true comparison will be case sensitive otherwise its not
 598    * @return
 599    */
 600   public static boolean isSameResidue(char c1, char c2,
 601           boolean caseSensitive)
 602   {
 603     return caseSensitive ? c1 == c2
 604             : Character.toUpperCase(c1) == Character.toUpperCase(c2);
 605   }
 606 }